# Upgrade -if needed
# !pip3 install openpyxl
# !pip3 install --upgrade joblib
# !pip3 install --upgrade matplotlib
# !pip3 install --upgrade numpy
# !pip3 install --upgrade pandas
# !pip3 install --upgrade scikit-learn
# !pip3 install --upgrade scipy
# !pip3 install --upgrade seaborn
# !pip3 install --upgrade xgboost
# !pip3 install --upgrade Keras
# !pip3 install --upgrade tensorflow
# not sure whether the following are needed
# !pip3 install --upgrade hyperopt
# !pip3 install --upgrade scikit-optimize
# !pip3 install --upgrade optuna
# !pip3 install --upgrade xlrd
# Data management
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import scipy.stats as stats
# To display all columns in the dataset.
pd.set_option('display.max_columns', None)
# Visualisation
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# For the models that will be implemented
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
# Metrics that will be used to evaluate/train the models
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import ConfusionMatrixDisplay
# Optimization of the model - Bayesian optimization
from skopt import BayesSearchCV
from sklearn.tree import DecisionTreeClassifier
# For the feature selection
# these are required for SequentialFeatureSelector to be used
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import association_rules,apriori
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# Interpretation
from sklearn.inspection import permutation_importance
# To avoid irrelevant (to the thesis) messages
import warnings
warnings.simplefilter("ignore", UserWarning)
import warnings
warnings.filterwarnings('ignore')
def analyze_column_and_export(for_segmentation, column_name, output_file=None):
    """Build a good/bad frequency, WOE and IV summary for one column.

    Parameters:
    for_segmentation (pd.DataFrame): dataset; must contain a 'Defaulted'
        column where 1 = bad (default) and 0 = good (non-default).
    column_name (str): the column to summarize, one row per unique value.
    output_file (str or None): path of the Excel file to write; when None
        (new default, backward compatible) the export is skipped.

    Returns:
    pd.DataFrame: the summary table (also written to Excel when requested).

    Raises:
    ValueError: if 'Defaulted' is missing from the dataset.
    """
    if 'Defaulted' not in for_segmentation.columns:
        raise ValueError("The dataset must contain a 'Defaulted' column.")
    # "Percent of Total Frequency" for each unique value; its index defines
    # the canonical row order for every other statistic below.
    total_count = for_segmentation[column_name].value_counts(normalize=True) * 100
    goods = for_segmentation[for_segmentation['Defaulted'] == 0][column_name]
    bads = for_segmentation[for_segmentation['Defaulted'] == 1][column_name]
    # BUGFIX: reindex every Series onto total_count.index. The original mixed
    # positional .values arrays with index-aligned Series inside the DataFrame
    # constructor, which misaligns (or fails) whenever a value has no goods or
    # no bads; aligning explicitly makes every column refer to the same value.
    goods_count = goods.value_counts().reindex(total_count.index, fill_value=0)
    goods_percent = (goods.value_counts(normalize=True) * 100).reindex(total_count.index, fill_value=0)
    bads_count = bads.value_counts().reindex(total_count.index, fill_value=0)
    bads_percent = (bads.value_counts(normalize=True) * 100).reindex(total_count.index, fill_value=0)
    summary_df = pd.DataFrame({
        column_name: total_count.index,
        'Percent of Total Frequency': total_count.values,
        'Frequency Count (Goods)': goods_count.values,
        'Percent of Column Frequency (Goods)': goods_percent.values,
        'Frequency Count (Bads)': bads_count.values,
        'Percent of Column Frequency (Bads)': bads_percent.values,
    }).fillna(0)
    # Good-bad odds; a value with zero bads would give +inf, zeroed instead.
    # (Assignment instead of chained inplace replace, which pandas deprecates.)
    summary_df['Good-Bad Odds'] = summary_df['Frequency Count (Goods)'] / summary_df['Frequency Count (Bads)']
    summary_df['Good-Bad Odds'] = summary_df['Good-Bad Odds'].replace([np.inf, -np.inf], 0)
    # Bad rate as a percentage of the row's population
    summary_df['Bad Rate'] = (summary_df['Frequency Count (Bads)'] / (summary_df['Frequency Count (Goods)'] + summary_df['Frequency Count (Bads)'])) * 100
    # Weight of evidence: ln(%goods / %bads); zero-bad rows become NaN first
    summary_df['WOE'] = np.log(np.where(summary_df['Percent of Column Frequency (Bads)'] == 0, np.nan, summary_df['Percent of Column Frequency (Goods)'] / summary_df['Percent of Column Frequency (Bads)']))
    # Neutralise undefined WOE values (+/-inf and NaN -> 0)
    summary_df['WOE'] = summary_df['WOE'].replace([np.inf, -np.inf], 0).fillna(0)
    # Information Value contribution of each row
    summary_df['IV'] = (summary_df['Percent of Column Frequency (Goods)'] - summary_df['Percent of Column Frequency (Bads)']) * summary_df['WOE']
    summary_df['IV'] = summary_df['IV'].fillna(0)
    # 'grpchar' mirrors the analysed column (kept for downstream consumers)
    summary_df['grpchar'] = summary_df[column_name]
    # Reset index to avoid ambiguity, then sort by the analysed column
    summary_df.reset_index(drop=True, inplace=True)
    summary_df.sort_values(by=column_name, inplace=True)
    # Export only when a path is supplied (requires openpyxl)
    if output_file is not None:
        summary_df.to_excel(output_file, index=False)
    return summary_df
def calculate_iv(data, target_variable):
    """
    Calculate the Information Value (IV) for all variables in the dataset that end with '_segm'.
    Parameters:
    data (pd.DataFrame): The dataset containing the segmented variables.
    target_variable (pd.Series): The target variable where 1 indicates default and 0 indicates non-default.
    Returns:
    pd.DataFrame: A DataFrame containing the IV values for each segmented variable,
    sorted in descending IV order. Empty (with the expected columns) when no
    '_segm' columns exist.
    """
    iv_list = []
    segmented_columns = [col for col in data.columns if col.endswith('_segm')]
    for column in segmented_columns:
        # Cross-tabulation of the segmented variable with the binary target
        cross_tab = pd.crosstab(data[column], target_variable)
        # NOTE(review): assumes both classes (0 and 1) appear in the target
        cross_tab.columns = ['non_default', 'default']
        cross_tab['total'] = cross_tab['non_default'] + cross_tab['default']
        # Distribution of goods and bads across the bins
        cross_tab['non_default_dist'] = cross_tab['non_default'] / cross_tab['non_default'].sum()
        cross_tab['default_dist'] = cross_tab['default'] / cross_tab['default'].sum()
        # WoE = ln(%goods / %bads) per bin
        cross_tab['woe'] = np.log(cross_tab['non_default_dist'] / cross_tab['default_dist'])
        # BUGFIX: a bin with zero goods or zero bads yields +/-inf WoE, which
        # propagates into an infinite IV; neutralise it to 0 (consistent with
        # how analyze_column_and_export treats undefined WOE).
        cross_tab['woe'] = cross_tab['woe'].replace([np.inf, -np.inf], 0)
        # IV contribution per bin (scaled by 100, matching the rest of the file)
        cross_tab['iv'] = (cross_tab['non_default_dist'] - cross_tab['default_dist']) * cross_tab['woe'] * 100
        # Total IV of the variable is the sum over its bins
        iv_value = cross_tab['iv'].sum()
        iv_list.append({'Variable': column, 'IV': iv_value})
    # BUGFIX: without this guard, sort_values raised KeyError on an empty list
    if not iv_list:
        return pd.DataFrame(columns=['Variable', 'IV'])
    iv_df = pd.DataFrame(iv_list).sort_values(by='IV', ascending=False)
    return iv_df
Function that calculates KS, AUC & Gini on an already fitted model
def calculate_DiscriminatoryStats(X, y, model, dataset_name):
    """Compute, plot and print KS, AUC and Gini for a fitted classifier.

    Parameters:
    X: feature matrix accepted by model.predict_proba.
    y: ground-truth labels; 1 = default (bad), 0 = non-default (good).
    model: fitted sklearn-style classifier exposing predict_proba.
    dataset_name (str): name used in the printed report (e.g. 'train').
    """
    # Predicted probability of class 1 (default)
    pred_prob_y = model.predict_proba(X)[:, 1]
    # Probability of being good (non-default)
    pred_prob_y_good = 1 - pred_prob_y
    # Rounded good-probabilities side by side with the ground truth
    df = pd.DataFrame({'Prob_Good': np.round(pred_prob_y_good, 2), 'y': y})
    # Scorecard scaling: base 400 plus 28.85 * log-odds of being good.
    # NOTE(review): 28.85 ~= 20/ln(2), presumably "20 points to double the
    # odds" — confirm against the scorecard specification.
    df['Credit Score'] = (400 + np.round(28.85 * np.log((pred_prob_y_good) / (1 - pred_prob_y_good)), 0)).astype(int)
    # Find the minimum positive credit score
    min_positive_credit_score = df[df['Credit Score'] > 0]['Credit Score'].min()
    # Floor (not cap) non-positive scores at the smallest positive score seen
    df['Credit Score'] = df['Credit Score'].apply(lambda x: min_positive_credit_score if x <= 0 else x)
    # Number of applicants per credit score
    df_summary = df.groupby('Credit Score').size().reset_index(name='num_applicants')
    # Counts of goods (y == 0) and bads (y == 1) per score from ground truth
    good_counts = df[df['y'] == 0].groupby('Credit Score').size().reset_index(name='num_goods')
    bad_counts = df[df['y'] == 1].groupby('Credit Score').size().reset_index(name='num_bads')
    # Left-join so scores with only goods or only bads keep 0 in the other column
    df_summary = df_summary.merge(good_counts, on='Credit Score', how='left').merge(bad_counts, on='Credit Score', how='left').fillna(0)
    # Total applicants per score row
    df_summary['total'] = df_summary['num_goods'] + df_summary['num_bads']
    # Cumulative frequencies of goods and bads (ascending score order)
    df_summary['cum_freq_goods'] = df_summary['num_goods'].cumsum()
    df_summary['cum_freq_bads'] = df_summary['num_bads'].cumsum()
    # Percentages of total goods and bads per score
    total_goods = df_summary['num_goods'].sum()
    total_bads = df_summary['num_bads'].sum()
    df_summary['perc_total_goods'] = (df_summary['num_goods'] / total_goods * 100).round(2)
    df_summary['perc_total_bads'] = (df_summary['num_bads'] / total_bads * 100).round(2)
    # Cumulative percentage of goods and bads
    df_summary['cum_perc_goods'] = (df_summary['cum_freq_goods'] / total_goods * 100).round(2)
    df_summary['cum_perc_bads'] = (df_summary['cum_freq_bads'] / total_bads * 100).round(2)
    # Difference between the two cumulative distributions at each score
    df_summary['Separation'] = (df_summary['cum_perc_goods'] - df_summary['cum_perc_bads']).round(2)
    # Kolmogorov-Smirnov statistic = maximum absolute separation
    ks_statistic = df_summary['Separation'].abs().max()
    # ROC curve and AUC from the raw default probabilities
    fpr, tpr, _ = roc_curve(y, pred_prob_y)
    auc_metric = auc(fpr, tpr)
    # Gini coefficient derived from AUC
    gini_metric = 2 * auc_metric - 1
    # Display the per-score summary table
    print(df_summary)
    # Left panel: cumulative percentages of goods and bads by score
    fig, ax = plt.subplots(1, 2, figsize=(18, 6))
    ax[0].plot(df_summary['Credit Score'], df_summary['cum_perc_goods'], label='Cumulative % of Goods', marker='o')
    ax[0].plot(df_summary['Credit Score'], df_summary['cum_perc_bads'], label='Cumulative % of Bads', marker='o')
    ax[0].set_xlabel('Credit Score')
    ax[0].set_ylabel('Cumulative Percentage')
    ax[0].set_title('Cumulative Percentage of Goods and Bads')
    ax[0].legend()
    ax[0].grid(True)
    # Right panel: the ROC curve against the 45-degree no-skill line
    ax[1].plot(fpr, tpr, label=f'ROC curve (area = {auc_metric:.2f})')
    ax[1].plot([0, 1], [0, 1], 'k--', label='45 degree line')
    ax[1].set_xlabel('False Positive Rate')
    ax[1].set_ylabel('True Positive Rate')
    ax[1].set_title('Receiver Operating Characteristic (ROC) Curve')
    ax[1].legend()
    ax[1].grid(True)
    plt.show()
    # Report the three discriminatory-power statistics
    print(f'The Kolmogorov-Smirnov statistic on the {dataset_name} data is: {ks_statistic:.2f}')
    print(f'AUC metric on the {dataset_name} data is: {auc_metric:.2f}')
    print(f'Gini metric on the {dataset_name} data is: {gini_metric:.2f}')
Functions that will be used for the Population Stability Index calculation
(comparison of the scoring distribution between the training and test data)
def calculate_credit_scores(X, model):
    """Map model default probabilities to integer credit scores.

    Scores are 400 plus 28.85 times the log-odds of being good, rounded
    to the nearest integer.
    """
    prob_default = model.predict_proba(X)[:, 1]
    prob_good = 1 - prob_default
    log_odds = np.log(prob_good / (1 - prob_good))
    return (400 + np.round(28.85 * log_odds, 0)).astype(int)
def create_scoring_pools(train_scores):
    """Split training credit scores into ten decile-based scoring pools.

    Returns the binned scores, the bin edges (open-ended at both extremes)
    and the human-readable pool labels.
    """
    decile_edges = np.percentile(train_scores, list(range(10, 100, 10)))
    bins = [-np.inf, *decile_edges, np.inf]
    labels = []
    for lower, upper in zip(bins[:-1], bins[1:]):
        if lower == -np.inf:
            labels.append(f'<={int(upper)}')
        elif upper == np.inf:
            labels.append(f'>{int(lower)}')
        else:
            labels.append(f'{int(lower + 1)} - {int(upper)}')
    scoring_pools = pd.cut(train_scores, bins=bins, labels=labels)
    return scoring_pools, bins, labels
def calculate_psi(train_pools, test_scores, bins, labels):
    """Population Stability Index between train pools and binned test scores.

    Returns the train distribution, test distribution and the PSI statistic.
    """
    train_dist = train_pools.value_counts().sort_index() / len(train_pools)
    # Bin the test scores with the pools derived from the training data
    test_pools = pd.cut(test_scores, bins=bins, labels=labels)
    test_dist = test_pools.value_counts().sort_index() / len(test_pools)
    # PSI = sum over pools of (p_train - p_test) * ln(p_train / p_test)
    shift = train_dist - test_dist
    psi_stat = (shift * np.log(train_dist / test_dist)).sum()
    return train_dist, test_dist, psi_stat
def plot_distribution(train_dist, test_dist, labels):
    """Bar-plot the train vs. test scoring-pool distributions side by side.

    Parameters:
    train_dist, test_dist: per-pool proportions (as returned by calculate_psi).
    labels: pool labels, in pool order, used for the x-axis.
    """
    df = pd.DataFrame({'Train': train_dist, 'Test': test_dist})
    # Relabel the index so the x-axis shows pool ranges instead of codes
    df.index = labels
    df.plot(kind='bar', figsize=(12, 6))
    plt.xlabel('Scoring Pools')
    plt.ylabel('Proportion')
    plt.title('Distribution of Scoring Pools in Train and Test Sets')
    plt.grid(True)
    plt.show()
def calculate_and_plot_psi(train_X, test_X, model, train_name, test_name):
    """Score both datasets, plot the pool distributions and report the PSI.

    Uses the training scores to define the pools, then interprets the PSI
    with the standard rule-of-thumb thresholds (0.1 and 0.25).
    """
    scores_train = calculate_credit_scores(train_X, model)
    scores_test = calculate_credit_scores(test_X, model)
    pools_train, bin_edges, pool_labels = create_scoring_pools(scores_train)
    dist_train, dist_test, psi_stat = calculate_psi(pools_train, scores_test, bin_edges, pool_labels)
    plot_distribution(dist_train, dist_test, pool_labels)
    print(f'The PSI statistic between {train_name} and {test_name} sets is: {psi_stat:.3f}')
    if psi_stat < 0.1:
        print(f'No significant shift in the population (PSI = {psi_stat:.3f})')
    elif psi_stat < 0.25:
        print(f'Moderate shift in the population (PSI = {psi_stat:.3f})')
    else:
        print(f'Significant shift in the population (PSI = {psi_stat:.3f})')
def calculate_iv_comparison(train_data, test_data, y_train, y_test):
    """
    Calculate and compare the Information Value (IV) for every column of the
    training dataset against the same column of the test dataset.
    NOTE: unlike calculate_iv, this function iterates over ALL columns of
    train_data (the original docstring claimed only '_segm' columns, but the
    code never filtered), so pass already-segmented frames.
    Parameters:
    train_data (pd.DataFrame): The training dataset containing the segmented variables.
    test_data (pd.DataFrame): The test dataset containing the segmented variables.
    y_train (pd.Series): The target variable for the training dataset where 1 indicates default and 0 indicates non-default.
    y_test (pd.Series): The target variable for the test dataset where 1 indicates default and 0 indicates non-default.
    Returns:
    pd.DataFrame: A DataFrame containing the IV values for each variable in
    both datasets, sorted by IV_Train descending.
    """
    def _column_iv(values, target):
        # IV of one segmented column against a binary target, rounded to 2 dp
        cross_tab = pd.crosstab(values, target)
        # NOTE(review): assumes both classes (0 and 1) appear in the target
        cross_tab.columns = ['non_default', 'default']
        non_default_dist = cross_tab['non_default'] / cross_tab['non_default'].sum()
        default_dist = cross_tab['default'] / cross_tab['default'].sum()
        # BUGFIX: bins with zero goods or zero bads produce +/-inf WoE and an
        # infinite IV; neutralise them to 0 (consistent with calculate_iv).
        woe = np.log(non_default_dist / default_dist).replace([np.inf, -np.inf], 0)
        return np.round(((non_default_dist - default_dist) * woe * 100).sum(), 2)
    iv_list = []
    for column in train_data.columns:
        iv_list.append({
            'Variable': column,
            'IV_Train': _column_iv(train_data[column], y_train),
            'IV_Test': _column_iv(test_data[column], y_test)
        })
    # BUGFIX: guard the empty case (sort_values raised KeyError before)
    if not iv_list:
        return pd.DataFrame(columns=['Variable', 'IV_Train', 'IV_Test'])
    iv_df = pd.DataFrame(iv_list).sort_values(by='IV_Train', ascending=False)
    return iv_df
Neural-network versions of the helpers above: KS / AUC / Gini statistics
and PSI (these use model.predict instead of predict_proba)
def calculate_DiscriminatoryStats_nn(X, y, model, dataset_name):
    """Compute, plot and print KS, AUC and Gini for a fitted Keras-style model.

    Neural-network variant of calculate_DiscriminatoryStats: obtains the
    default probability from model.predict (flattened) instead of
    predict_proba.

    Parameters:
    X: feature matrix accepted by model.predict.
    y: ground-truth labels; 1 = default (bad), 0 = non-default (good).
    model: fitted model whose predict returns default probabilities.
    dataset_name (str): name used in the printed report (e.g. 'test').
    """
    # Predicted probability of default, flattened to a 1-D array
    pred_prob_y = model.predict(X).flatten()
    # Probability of being good (non-default)
    pred_prob_y_good = 1 - pred_prob_y
    # Rounded good-probabilities side by side with the ground truth
    df = pd.DataFrame({'Prob_Good': np.round(pred_prob_y_good, 2), 'y': y})
    # Scorecard scaling: base 400 plus 28.85 * log-odds of being good.
    # NOTE(review): 28.85 ~= 20/ln(2), presumably "20 points to double the
    # odds" — confirm against the scorecard specification.
    df['Credit Score'] = (400 + np.round(28.85 * np.log((pred_prob_y_good) / (1 - pred_prob_y_good)), 0)).astype(int)
    # Find the minimum positive credit score
    min_positive_credit_score = df[df['Credit Score'] > 0]['Credit Score'].min()
    # Floor (not cap) non-positive scores at the smallest positive score seen
    df['Credit Score'] = df['Credit Score'].apply(lambda x: min_positive_credit_score if x <= 0 else x)
    # Number of applicants per credit score
    df_summary = df.groupby('Credit Score').size().reset_index(name='num_applicants')
    # Counts of goods (y == 0) and bads (y == 1) per score from ground truth
    good_counts = df[df['y'] == 0].groupby('Credit Score').size().reset_index(name='num_goods')
    bad_counts = df[df['y'] == 1].groupby('Credit Score').size().reset_index(name='num_bads')
    # Left-join so scores with only goods or only bads keep 0 in the other column
    df_summary = df_summary.merge(good_counts, on='Credit Score', how='left').merge(bad_counts, on='Credit Score', how='left').fillna(0)
    # Total applicants per score row
    df_summary['total'] = df_summary['num_goods'] + df_summary['num_bads']
    # Cumulative frequencies of goods and bads (ascending score order)
    df_summary['cum_freq_goods'] = df_summary['num_goods'].cumsum()
    df_summary['cum_freq_bads'] = df_summary['num_bads'].cumsum()
    # Percentages of total goods and bads per score
    total_goods = df_summary['num_goods'].sum()
    total_bads = df_summary['num_bads'].sum()
    df_summary['perc_total_goods'] = (df_summary['num_goods'] / total_goods * 100).round(2)
    df_summary['perc_total_bads'] = (df_summary['num_bads'] / total_bads * 100).round(2)
    # Cumulative percentage of goods and bads
    df_summary['cum_perc_goods'] = (df_summary['cum_freq_goods'] / total_goods * 100).round(2)
    df_summary['cum_perc_bads'] = (df_summary['cum_freq_bads'] / total_bads * 100).round(2)
    # Difference between the two cumulative distributions at each score
    df_summary['Separation'] = (df_summary['cum_perc_goods'] - df_summary['cum_perc_bads']).round(2)
    # Kolmogorov-Smirnov statistic = maximum absolute separation
    ks_statistic = df_summary['Separation'].abs().max()
    # ROC curve and AUC from the raw default probabilities
    fpr, tpr, _ = roc_curve(y, pred_prob_y)
    auc_metric = auc(fpr, tpr)
    # Gini coefficient derived from AUC
    gini_metric = 2 * auc_metric - 1
    # Display the per-score summary table
    print(df_summary)
    # Left panel: cumulative percentages of goods and bads by score
    fig, ax = plt.subplots(1, 2, figsize=(18, 6))
    ax[0].plot(df_summary['Credit Score'], df_summary['cum_perc_goods'], label='Cumulative % of Goods', marker='o')
    ax[0].plot(df_summary['Credit Score'], df_summary['cum_perc_bads'], label='Cumulative % of Bads', marker='o')
    ax[0].set_xlabel('Credit Score')
    ax[0].set_ylabel('Cumulative Percentage')
    ax[0].set_title('Cumulative Percentage of Goods and Bads')
    ax[0].legend()
    ax[0].grid(True)
    # Right panel: the ROC curve against the 45-degree no-skill line
    ax[1].plot(fpr, tpr, label=f'ROC curve (area = {auc_metric:.2f})')
    ax[1].plot([0, 1], [0, 1], 'k--', label='45 degree line')
    ax[1].set_xlabel('False Positive Rate')
    ax[1].set_ylabel('True Positive Rate')
    ax[1].set_title('Receiver Operating Characteristic (ROC) Curve')
    ax[1].legend()
    ax[1].grid(True)
    plt.show()
    # Report the three discriminatory-power statistics
    print(f'The Kolmogorov-Smirnov statistic on the {dataset_name} data is: {ks_statistic:.2f}')
    print(f'AUC metric on the {dataset_name} data is: {auc_metric:.2f}')
    print(f'Gini metric on the {dataset_name} data is: {gini_metric:.2f}')
def calculate_credit_scores_nn(X, model):
    """Map neural-network default probabilities to integer credit scores.

    Same scaling as calculate_credit_scores (400 base, 28.85 per unit of
    log-odds) but reads probabilities from model.predict, flattened.
    """
    prob_default = model.predict(X).flatten()
    prob_good = 1 - prob_default
    log_odds = np.log(prob_good / (1 - prob_good))
    return (400 + np.round(28.85 * log_odds, 0)).astype(int)
def create_scoring_pools_nn(train_scores):
    """Split training credit scores into ten decile-based scoring pools.

    Identical pooling logic to create_scoring_pools; kept separate for the
    neural-network workflow.
    """
    cut_points = list(np.percentile(train_scores, [10, 20, 30, 40, 50, 60, 70, 80, 90]))
    bins = [-np.inf] + cut_points + [np.inf]
    labels = []
    for idx in range(1, len(bins)):
        lower, upper = bins[idx - 1], bins[idx]
        if lower == -np.inf:
            labels.append(f'<={int(upper)}')
        elif upper == np.inf:
            labels.append(f'>{int(lower)}')
        else:
            labels.append(f'{int(lower + 1)} - {int(upper)}')
    scoring_pools = pd.cut(train_scores, bins=bins, labels=labels)
    return scoring_pools, bins, labels
def calculate_psi_nn(train_pools, test_scores, bins, labels):
    """Population Stability Index between train pools and binned test scores.

    Neural-network twin of calculate_psi; returns the two distributions and
    the PSI statistic.
    """
    train_dist = train_pools.value_counts().sort_index() / len(train_pools)
    # Bin the test scores with the pools derived from the training data
    test_pools = pd.cut(test_scores, bins=bins, labels=labels)
    test_dist = test_pools.value_counts().sort_index() / len(test_pools)
    # PSI = sum over pools of (p_train - p_test) * ln(p_train / p_test)
    shift = train_dist - test_dist
    psi_stat = (shift * np.log(train_dist / test_dist)).sum()
    return train_dist, test_dist, psi_stat
def plot_distribution_nn(train_dist, test_dist, labels):
    """Bar-plot the train vs. test scoring-pool distributions side by side.

    Parameters:
    train_dist, test_dist: per-pool proportions (as returned by calculate_psi_nn).
    labels: pool labels, in pool order, used for the x-axis.
    """
    df = pd.DataFrame({'Train': train_dist, 'Test': test_dist})
    # Relabel the index so the x-axis shows pool ranges instead of codes
    df.index = labels
    df.plot(kind='bar', figsize=(12, 6))
    plt.xlabel('Scoring Pools')
    plt.ylabel('Proportion')
    plt.title('Distribution of Scoring Pools in Train and Test Sets')
    plt.grid(True)
    plt.show()
def calculate_and_plot_psi_nn(train_X, test_X, model, train_name, test_name):
    """Score both datasets with the network, plot the pools and report PSI.

    Pools come from the training scores; the PSI verdict uses the standard
    rule-of-thumb thresholds (0.1 and 0.25).
    """
    scores_train = calculate_credit_scores_nn(train_X, model)
    scores_test = calculate_credit_scores_nn(test_X, model)
    pools_train, bin_edges, pool_labels = create_scoring_pools_nn(scores_train)
    dist_train, dist_test, psi_stat = calculate_psi_nn(pools_train, scores_test, bin_edges, pool_labels)
    plot_distribution_nn(dist_train, dist_test, pool_labels)
    print(f'The PSI statistic between {train_name} and {test_name} sets is: {psi_stat:.3f}')
    if psi_stat < 0.1:
        print(f'No significant shift in the population (PSI = {psi_stat:.3f})')
    elif psi_stat < 0.25:
        print(f'Moderate shift in the population (PSI = {psi_stat:.3f})')
    else:
        print(f'Significant shift in the population (PSI = {psi_stat:.3f})')
# Import the data.
# NOTE(review): hard-coded local Windows path — adjust for your environment.
dataset = pd.read_csv("D:/Thesis data/accepted_2007_to_2018Q4.csv")
# Display the shape and (in a notebook cell) the first few rows
print(dataset.shape)
dataset.head()
(2260701, 151)
| id | member_id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | sub_grade | emp_title | emp_length | home_ownership | annual_inc | verification_status | issue_d | loan_status | pymnt_plan | url | desc | purpose | title | zip_code | addr_state | dti | delinq_2yrs | earliest_cr_line | fico_range_low | fico_range_high | inq_last_6mths | mths_since_last_delinq | mths_since_last_record | open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | out_prncp | out_prncp_inv | total_pymnt | total_pymnt_inv | total_rec_prncp | total_rec_int | total_rec_late_fee | recoveries | collection_recovery_fee | last_pymnt_d | last_pymnt_amnt | next_pymnt_d | last_credit_pull_d | last_fico_range_high | last_fico_range_low | collections_12_mths_ex_med | mths_since_last_major_derog | policy_code | application_type | annual_inc_joint | dti_joint | verification_status_joint | acc_now_delinq | tot_coll_amt | tot_cur_bal | open_acc_6m | open_act_il | open_il_12m | open_il_24m | mths_since_rcnt_il | total_bal_il | il_util | open_rv_12m | open_rv_24m | max_bal_bc | all_util | total_rev_hi_lim | inq_fi | total_cu_tl | inq_last_12m | acc_open_past_24mths | avg_cur_bal | bc_open_to_buy | bc_util | chargeoff_within_12_mths | delinq_amnt | mo_sin_old_il_acct | mo_sin_old_rev_tl_op | mo_sin_rcnt_rev_tl_op | mo_sin_rcnt_tl | mort_acc | mths_since_recent_bc | mths_since_recent_bc_dlq | mths_since_recent_inq | mths_since_recent_revol_delinq | num_accts_ever_120_pd | num_actv_bc_tl | num_actv_rev_tl | num_bc_sats | num_bc_tl | num_il_tl | num_op_rev_tl | num_rev_accts | num_rev_tl_bal_gt_0 | num_sats | num_tl_120dpd_2m | num_tl_30dpd | num_tl_90g_dpd_24m | num_tl_op_past_12m | pct_tl_nvr_dlq | percent_bc_gt_75 | pub_rec_bankruptcies | tax_liens | tot_hi_cred_lim | total_bal_ex_mort | total_bc_limit | total_il_high_credit_limit | revol_bal_joint | sec_app_fico_range_low | sec_app_fico_range_high | sec_app_earliest_cr_line | 
sec_app_inq_last_6mths | sec_app_mort_acc | sec_app_open_acc | sec_app_revol_util | sec_app_open_act_il | sec_app_num_rev_accts | sec_app_chargeoff_within_12_mths | sec_app_collections_12_mths_ex_med | sec_app_mths_since_last_major_derog | hardship_flag | hardship_type | hardship_reason | hardship_status | deferral_term | hardship_amount | hardship_start_date | hardship_end_date | payment_plan_start_date | hardship_length | hardship_dpd | hardship_loan_status | orig_projected_additional_accrued_interest | hardship_payoff_balance_amount | hardship_last_payment_amount | disbursement_method | debt_settlement_flag | debt_settlement_flag_date | settlement_status | settlement_date | settlement_amount | settlement_percentage | settlement_term | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 68407277 | NaN | 3600.0 | 3600.0 | 3600.0 | 36 months | 13.99 | 123.03 | C | C4 | leadman | 10+ years | MORTGAGE | 55000.0 | Not Verified | Dec-2015 | Fully Paid | n | https://lendingclub.com/browse/loanDetail.acti... | NaN | debt_consolidation | Debt consolidation | 190xx | PA | 5.91 | 0.0 | Aug-2003 | 675.0 | 679.0 | 1.0 | 30.0 | NaN | 7.0 | 0.0 | 2765.0 | 29.7 | 13.0 | w | 0.00 | 0.00 | 4421.723917 | 4421.72 | 3600.00 | 821.72 | 0.0 | 0.0 | 0.0 | Jan-2019 | 122.67 | NaN | Mar-2019 | 564.0 | 560.0 | 0.0 | 30.0 | 1.0 | Individual | NaN | NaN | NaN | 0.0 | 722.0 | 144904.0 | 2.0 | 2.0 | 0.0 | 1.0 | 21.0 | 4981.0 | 36.0 | 3.0 | 3.0 | 722.0 | 34.0 | 9300.0 | 3.0 | 1.0 | 4.0 | 4.0 | 20701.0 | 1506.0 | 37.2 | 0.0 | 0.0 | 148.0 | 128.0 | 3.0 | 3.0 | 1.0 | 4.0 | 69.0 | 4.0 | 69.0 | 2.0 | 2.0 | 4.0 | 2.0 | 5.0 | 3.0 | 4.0 | 9.0 | 4.0 | 7.0 | 0.0 | 0.0 | 0.0 | 3.0 | 76.9 | 0.0 | 0.0 | 0.0 | 178050.0 | 7746.0 | 2400.0 | 13734.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Cash | N | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 68355089 | NaN | 24700.0 | 24700.0 | 24700.0 | 36 months | 11.99 | 820.28 | C | C1 | Engineer | 10+ years | MORTGAGE | 65000.0 | Not Verified | Dec-2015 | Fully Paid | n | https://lendingclub.com/browse/loanDetail.acti... | NaN | small_business | Business | 577xx | SD | 16.06 | 1.0 | Dec-1999 | 715.0 | 719.0 | 4.0 | 6.0 | NaN | 22.0 | 0.0 | 21470.0 | 19.2 | 38.0 | w | 0.00 | 0.00 | 25679.660000 | 25679.66 | 24700.00 | 979.66 | 0.0 | 0.0 | 0.0 | Jun-2016 | 926.35 | NaN | Mar-2019 | 699.0 | 695.0 | 0.0 | NaN | 1.0 | Individual | NaN | NaN | NaN | 0.0 | 0.0 | 204396.0 | 1.0 | 1.0 | 0.0 | 1.0 | 19.0 | 18005.0 | 73.0 | 2.0 | 3.0 | 6472.0 | 29.0 | 111800.0 | 0.0 | 0.0 | 6.0 | 4.0 | 9733.0 | 57830.0 | 27.1 | 0.0 | 0.0 | 113.0 | 192.0 | 2.0 | 2.0 | 4.0 | 2.0 | NaN | 0.0 | 6.0 | 0.0 | 5.0 | 5.0 | 13.0 | 17.0 | 6.0 | 20.0 | 27.0 | 5.0 | 22.0 | 0.0 | 0.0 | 0.0 | 2.0 | 97.4 | 7.7 | 0.0 | 0.0 | 314017.0 | 39475.0 | 79300.0 | 24667.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Cash | N | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 68341763 | NaN | 20000.0 | 20000.0 | 20000.0 | 60 months | 10.78 | 432.66 | B | B4 | truck driver | 10+ years | MORTGAGE | 63000.0 | Not Verified | Dec-2015 | Fully Paid | n | https://lendingclub.com/browse/loanDetail.acti... | NaN | home_improvement | NaN | 605xx | IL | 10.78 | 0.0 | Aug-2000 | 695.0 | 699.0 | 0.0 | NaN | NaN | 6.0 | 0.0 | 7869.0 | 56.2 | 18.0 | w | 0.00 | 0.00 | 22705.924294 | 22705.92 | 20000.00 | 2705.92 | 0.0 | 0.0 | 0.0 | Jun-2017 | 15813.30 | NaN | Mar-2019 | 704.0 | 700.0 | 0.0 | NaN | 1.0 | Joint App | 71000.0 | 13.85 | Not Verified | 0.0 | 0.0 | 189699.0 | 0.0 | 1.0 | 0.0 | 4.0 | 19.0 | 10827.0 | 73.0 | 0.0 | 2.0 | 2081.0 | 65.0 | 14000.0 | 2.0 | 5.0 | 1.0 | 6.0 | 31617.0 | 2737.0 | 55.9 | 0.0 | 0.0 | 125.0 | 184.0 | 14.0 | 14.0 | 5.0 | 101.0 | NaN | 10.0 | NaN | 0.0 | 2.0 | 3.0 | 2.0 | 4.0 | 6.0 | 4.0 | 7.0 | 3.0 | 6.0 | 0.0 | 0.0 | 0.0 | 0.0 | 100.0 | 50.0 | 0.0 | 0.0 | 218418.0 | 18696.0 | 6200.0 | 14877.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Cash | N | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 66310712 | NaN | 35000.0 | 35000.0 | 35000.0 | 60 months | 14.85 | 829.90 | C | C5 | Information Systems Officer | 10+ years | MORTGAGE | 110000.0 | Source Verified | Dec-2015 | Current | n | https://lendingclub.com/browse/loanDetail.acti... | NaN | debt_consolidation | Debt consolidation | 076xx | NJ | 17.06 | 0.0 | Sep-2008 | 785.0 | 789.0 | 0.0 | NaN | NaN | 13.0 | 0.0 | 7802.0 | 11.6 | 17.0 | w | 15897.65 | 15897.65 | 31464.010000 | 31464.01 | 19102.35 | 12361.66 | 0.0 | 0.0 | 0.0 | Feb-2019 | 829.90 | Apr-2019 | Mar-2019 | 679.0 | 675.0 | 0.0 | NaN | 1.0 | Individual | NaN | NaN | NaN | 0.0 | 0.0 | 301500.0 | 1.0 | 1.0 | 0.0 | 1.0 | 23.0 | 12609.0 | 70.0 | 1.0 | 1.0 | 6987.0 | 45.0 | 67300.0 | 0.0 | 1.0 | 0.0 | 2.0 | 23192.0 | 54962.0 | 12.1 | 0.0 | 0.0 | 36.0 | 87.0 | 2.0 | 2.0 | 1.0 | 2.0 | NaN | NaN | NaN | 0.0 | 4.0 | 5.0 | 8.0 | 10.0 | 2.0 | 10.0 | 13.0 | 5.0 | 13.0 | 0.0 | 0.0 | 0.0 | 1.0 | 100.0 | 0.0 | 0.0 | 0.0 | 381215.0 | 52226.0 | 62500.0 | 18000.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Cash | N | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 68476807 | NaN | 10400.0 | 10400.0 | 10400.0 | 60 months | 22.45 | 289.91 | F | F1 | Contract Specialist | 3 years | MORTGAGE | 104433.0 | Source Verified | Dec-2015 | Fully Paid | n | https://lendingclub.com/browse/loanDetail.acti... | NaN | major_purchase | Major purchase | 174xx | PA | 25.37 | 1.0 | Jun-1998 | 695.0 | 699.0 | 3.0 | 12.0 | NaN | 12.0 | 0.0 | 21929.0 | 64.5 | 35.0 | w | 0.00 | 0.00 | 11740.500000 | 11740.50 | 10400.00 | 1340.50 | 0.0 | 0.0 | 0.0 | Jul-2016 | 10128.96 | NaN | Mar-2018 | 704.0 | 700.0 | 0.0 | NaN | 1.0 | Individual | NaN | NaN | NaN | 0.0 | 0.0 | 331730.0 | 1.0 | 3.0 | 0.0 | 3.0 | 14.0 | 73839.0 | 84.0 | 4.0 | 7.0 | 9702.0 | 78.0 | 34000.0 | 2.0 | 1.0 | 3.0 | 10.0 | 27644.0 | 4567.0 | 77.5 | 0.0 | 0.0 | 128.0 | 210.0 | 4.0 | 4.0 | 6.0 | 4.0 | 12.0 | 1.0 | 12.0 | 0.0 | 4.0 | 6.0 | 5.0 | 9.0 | 10.0 | 7.0 | 19.0 | 6.0 | 12.0 | 0.0 | 0.0 | 0.0 | 4.0 | 96.6 | 60.0 | 0.0 | 0.0 | 439570.0 | 95768.0 | 20300.0 | 88097.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Cash | N | NaN | NaN | NaN | NaN | NaN | NaN |
# Re-read with issue_d parsed as datetime so it can be filtered by date.
# The infer_datetime_format argument was removed: it was deprecated in
# pandas 2.0 and is a no-op there (strict format inference is the default).
dataset = pd.read_csv("D:/Thesis data/accepted_2007_to_2018Q4.csv", parse_dates=['issue_d'])
# Keep only loans issued in 2018 H1 (January through June 2018)
dataset = dataset[(dataset.issue_d >= '2018-01-01 00:00:00') & (dataset.issue_d < '2018-07-01 00:00:00')]
dataset = dataset.reset_index(drop=True)
dataset.head()
| id | member_id | loan_amnt | funded_amnt | funded_amnt_inv | term | int_rate | installment | grade | sub_grade | emp_title | emp_length | home_ownership | annual_inc | verification_status | issue_d | loan_status | pymnt_plan | url | desc | purpose | title | zip_code | addr_state | dti | delinq_2yrs | earliest_cr_line | fico_range_low | fico_range_high | inq_last_6mths | mths_since_last_delinq | mths_since_last_record | open_acc | pub_rec | revol_bal | revol_util | total_acc | initial_list_status | out_prncp | out_prncp_inv | total_pymnt | total_pymnt_inv | total_rec_prncp | total_rec_int | total_rec_late_fee | recoveries | collection_recovery_fee | last_pymnt_d | last_pymnt_amnt | next_pymnt_d | last_credit_pull_d | last_fico_range_high | last_fico_range_low | collections_12_mths_ex_med | mths_since_last_major_derog | policy_code | application_type | annual_inc_joint | dti_joint | verification_status_joint | acc_now_delinq | tot_coll_amt | tot_cur_bal | open_acc_6m | open_act_il | open_il_12m | open_il_24m | mths_since_rcnt_il | total_bal_il | il_util | open_rv_12m | open_rv_24m | max_bal_bc | all_util | total_rev_hi_lim | inq_fi | total_cu_tl | inq_last_12m | acc_open_past_24mths | avg_cur_bal | bc_open_to_buy | bc_util | chargeoff_within_12_mths | delinq_amnt | mo_sin_old_il_acct | mo_sin_old_rev_tl_op | mo_sin_rcnt_rev_tl_op | mo_sin_rcnt_tl | mort_acc | mths_since_recent_bc | mths_since_recent_bc_dlq | mths_since_recent_inq | mths_since_recent_revol_delinq | num_accts_ever_120_pd | num_actv_bc_tl | num_actv_rev_tl | num_bc_sats | num_bc_tl | num_il_tl | num_op_rev_tl | num_rev_accts | num_rev_tl_bal_gt_0 | num_sats | num_tl_120dpd_2m | num_tl_30dpd | num_tl_90g_dpd_24m | num_tl_op_past_12m | pct_tl_nvr_dlq | percent_bc_gt_75 | pub_rec_bankruptcies | tax_liens | tot_hi_cred_lim | total_bal_ex_mort | total_bc_limit | total_il_high_credit_limit | revol_bal_joint | sec_app_fico_range_low | sec_app_fico_range_high | sec_app_earliest_cr_line | 
sec_app_inq_last_6mths | sec_app_mort_acc | sec_app_open_acc | sec_app_revol_util | sec_app_open_act_il | sec_app_num_rev_accts | sec_app_chargeoff_within_12_mths | sec_app_collections_12_mths_ex_med | sec_app_mths_since_last_major_derog | hardship_flag | hardship_type | hardship_reason | hardship_status | deferral_term | hardship_amount | hardship_start_date | hardship_end_date | payment_plan_start_date | hardship_length | hardship_dpd | hardship_loan_status | orig_projected_additional_accrued_interest | hardship_payoff_balance_amount | hardship_last_payment_amount | disbursement_method | debt_settlement_flag | debt_settlement_flag_date | settlement_status | settlement_date | settlement_amount | settlement_percentage | settlement_term | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 130954621 | NaN | 5000.0 | 5000.0 | 5000.0 | 36 months | 20.39 | 186.82 | D | D4 | General Manager | 8 years | RENT | 50000.0 | Verified | 2018-03-01 | Current | n | https://lendingclub.com/browse/loanDetail.acti... | NaN | other | Other | 740xx | OK | 21.80 | 1.0 | Jan-2009 | 665.0 | 669.0 | 0.0 | 9.0 | NaN | 5.0 | 0.0 | 116.0 | 23.2 | 18.0 | w | 3780.31 | 3780.31 | 2043.690000 | 2043.69 | 1219.69 | 824.00 | 0.0 | 0.0 | 0.0 | Mar-2019 | 186.82 | Apr-2019 | Mar-2019 | 609.0 | 605.0 | 0.0 | 9.0 | 1.0 | Individual | NaN | NaN | NaN | 0.0 | 0.0 | 19344.0 | 0.0 | 2.0 | 0.0 | 1.0 | 16.0 | 14118.0 | 51.0 | 1.0 | 2.0 | 85.0 | 58.0 | 500.0 | 9.0 | 0.0 | 5.0 | 3.0 | 3869.0 | 384.0 | 23.2 | 1.0 | 0.0 | 80.0 | 13.0 | 11.0 | 11.0 | 0.0 | 11.0 | NaN | 2.0 | NaN | 4.0 | 2.0 | 2.0 | 2.0 | 2.0 | 15.0 | 2.0 | 2.0 | 2.0 | 5.0 | 0.0 | 0.0 | 1.0 | 1.0 | 77.8 | 0.0 | 0.0 | 0.0 | 33430.0 | 19344.0 | 500.0 | 27820.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Cash | N | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 130964697 | NaN | 15000.0 | 15000.0 | 15000.0 | 36 months | 9.92 | 483.45 | B | B2 | IT Director | 2 years | OWN | 196000.0 | Source Verified | 2018-03-01 | Current | n | https://lendingclub.com/browse/loanDetail.acti... | NaN | debt_consolidation | Debt consolidation | 337xx | FL | 18.29 | 0.0 | Jul-1998 | 700.0 | 704.0 | 0.0 | 65.0 | NaN | 19.0 | 0.0 | 24243.0 | 46.3 | 53.0 | w | 10878.50 | 10878.50 | 5301.420000 | 5301.42 | 4121.50 | 1179.92 | 0.0 | 0.0 | 0.0 | Feb-2019 | 483.45 | Apr-2019 | Mar-2019 | 694.0 | 690.0 | 0.0 | NaN | 1.0 | Individual | NaN | NaN | NaN | 0.0 | 0.0 | 534954.0 | 4.0 | 3.0 | 2.0 | 2.0 | 6.0 | 113470.0 | 59.0 | 4.0 | 12.0 | 10495.0 | 51.0 | 52400.0 | 4.0 | 1.0 | 7.0 | 15.0 | 31468.0 | 7368.0 | 74.1 | 0.0 | 0.0 | 141.0 | 236.0 | 4.0 | 4.0 | 5.0 | 11.0 | NaN | 6.0 | NaN | 0.0 | 4.0 | 10.0 | 5.0 | 16.0 | 11.0 | 14.0 | 37.0 | 10.0 | 19.0 | 0.0 | 0.0 | 0.0 | 6.0 | 98.0 | 75.0 | 0.0 | 0.0 | 605228.0 | 137713.0 | 28500.0 | 147178.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Cash | N | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 130955326 | NaN | 11200.0 | 11200.0 | 11200.0 | 60 months | 30.79 | 367.82 | G | G1 | Client services | < 1 year | RENT | 44000.0 | Not Verified | 2018-03-01 | Current | n | https://lendingclub.com/browse/loanDetail.acti... | NaN | medical | Medical expenses | 030xx | NH | 43.97 | 1.0 | Jul-2007 | 665.0 | 669.0 | 2.0 | 6.0 | NaN | 8.0 | 0.0 | 1526.0 | 24.6 | 14.0 | w | 10193.73 | 10193.73 | 4007.700000 | 4007.70 | 1006.27 | 3001.43 | 0.0 | 0.0 | 0.0 | Feb-2019 | 367.82 | Apr-2019 | Mar-2019 | 629.0 | 625.0 | 0.0 | 70.0 | 1.0 | Joint App | 81000.0 | 31.94 | Not Verified | 0.0 | 0.0 | 67173.0 | 1.0 | 4.0 | 1.0 | 4.0 | 8.0 | 65647.0 | 89.0 | 1.0 | 1.0 | 1011.0 | 84.0 | 6200.0 | 8.0 | 1.0 | 10.0 | 5.0 | 8397.0 | 632.0 | 66.7 | 0.0 | 0.0 | 124.0 | 128.0 | 5.0 | 5.0 | 0.0 | 34.0 | 35.0 | 0.0 | 35.0 | 1.0 | 2.0 | 3.0 | 2.0 | 3.0 | 8.0 | 4.0 | 6.0 | 3.0 | 8.0 | 0.0 | 0.0 | 0.0 | 2.0 | 71.4 | 0.0 | 0.0 | 0.0 | 80367.0 | 67173.0 | 1900.0 | 74167.0 | 7101.0 | 610.0 | 614.0 | Feb-2005 | 3.0 | 1.0 | 14.0 | 80.0 | 11.0 | 8.0 | 0.0 | 2.0 | 37.0 | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Cash | N | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 130504052 | NaN | 25000.0 | 25000.0 | 25000.0 | 60 months | 21.85 | 688.35 | D | D5 | Asphalt Supervisor | 10+ years | MORTGAGE | 65000.0 | Source Verified | 2018-03-01 | Current | n | https://lendingclub.com/browse/loanDetail.acti... | NaN | debt_consolidation | Debt consolidation | 361xx | AL | 12.89 | 1.0 | Mar-1995 | 665.0 | 669.0 | 1.0 | 22.0 | NaN | 7.0 | 0.0 | 8657.0 | 98.4 | 16.0 | w | 22188.73 | 22188.73 | 7511.160000 | 7511.16 | 2811.27 | 4699.89 | 0.0 | 0.0 | 0.0 | Feb-2019 | 688.35 | Apr-2019 | Mar-2019 | 669.0 | 665.0 | 0.0 | 23.0 | 1.0 | Individual | NaN | NaN | NaN | 0.0 | 0.0 | 74795.0 | 0.0 | 2.0 | 0.0 | 2.0 | 16.0 | 8382.0 | 82.0 | 0.0 | 0.0 | 3237.0 | 90.0 | 8800.0 | 4.0 | 3.0 | 3.0 | 2.0 | 10685.0 | 63.0 | 98.1 | 0.0 | 0.0 | 69.0 | 126.0 | 72.0 | 16.0 | 2.0 | 126.0 | NaN | 0.0 | 22.0 | 2.0 | 1.0 | 3.0 | 1.0 | 1.0 | 4.0 | 3.0 | 9.0 | 3.0 | 7.0 | 0.0 | 0.0 | 1.0 | 0.0 | 75.0 | 100.0 | 0.0 | 0.0 | 101234.0 | 17039.0 | 3300.0 | 10220.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Cash | N | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 130956066 | NaN | 3000.0 | 3000.0 | 3000.0 | 36 months | 7.34 | 93.10 | A | A4 | Scale Technician | 9 years | RENT | 52000.0 | Source Verified | 2018-03-01 | Fully Paid | n | https://lendingclub.com/browse/loanDetail.acti... | NaN | major_purchase | Major purchase | 988xx | WA | 0.58 | 0.0 | Jan-1998 | 760.0 | 764.0 | 0.0 | 26.0 | NaN | 7.0 | 0.0 | 141.0 | 0.5 | 30.0 | w | 0.00 | 0.00 | 3011.577285 | 3011.58 | 3000.00 | 11.58 | 0.0 | 0.0 | 0.0 | May-2018 | 614.03 | NaN | Nov-2018 | 764.0 | 760.0 | 0.0 | NaN | 1.0 | Individual | NaN | NaN | NaN | 0.0 | 0.0 | 150592.0 | 0.0 | 0.0 | 1.0 | 2.0 | 7.0 | 0.0 | NaN | 0.0 | 1.0 | 141.0 | 1.0 | 31000.0 | 1.0 | 2.0 | 2.0 | 3.0 | 25099.0 | 30359.0 | 0.5 | 0.0 | 0.0 | 132.0 | 242.0 | 18.0 | 7.0 | 4.0 | 18.0 | NaN | 7.0 | NaN | 0.0 | 1.0 | 1.0 | 4.0 | 15.0 | 7.0 | 6.0 | 19.0 | 1.0 | 7.0 | 0.0 | 0.0 | 0.0 | 1.0 | 96.7 | 0.0 | 0.0 | 0.0 | 191216.0 | 141.0 | 30500.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | N | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | Cash | N | NaN | NaN | NaN | NaN | NaN | NaN |
# Check the dimensions of the dataset (rows, columns)
dataset.shape
(238636, 151)
# Inspect the data type of every column
dataset.dtypes
id object
member_id float64
loan_amnt float64
funded_amnt float64
funded_amnt_inv float64
...
settlement_status object
settlement_date object
settlement_amount float64
settlement_percentage float64
settlement_term float64
Length: 151, dtype: object
The mapping of Good-Bad applicants (i.e. the creation of the Target variable) is made upon using the descriptions as given by LendingClub.com:
(https://www.kaggle.com/code/pavlofesenko/minimizing-risks-for-loan-investments)
# A loan is flagged as defaulted (1) when its status is charged off or late;
# every other status is mapped to 0.
bad_statuses = ['Charged Off', 'Late (16-30 days)', 'Late (31-120 days)']
dataset['Defaulted'] = dataset['loan_status'].isin(bad_statuses).astype('int')
# Class balance of the new target
dataset['Defaulted'].value_counts()
Defaulted 0 225388 1 13248 Name: count, dtype: int64
# Total Bad Rate for the approved applications of 2018
n_loans = len(dataset['Defaulted'])
dataset['Defaulted'].value_counts() / n_loans
Defaulted 0 0.944484 1 0.055516 Name: count, dtype: float64
Hence:
Some variables are not available at the time of the loan request, such as the "Interest rate" of the loan (as it is determined by the financial institution based on the risk-level of the client). Additionally, some other variables such as the "Employment Title" & "URL" are categorical with too many values -and no meaningful insight could be obtained. Finally, some other variables will be dropped, since the information they contain is generally regarded as of minimal importance -such as the "verification_status_joint", where the info exists for a very small percentage of the applications (since very few applications have co-applicants in the first place).
Furthermore, it is important to note that in credit risk, all the variables used on a model should have a clear and sound interpretation. Hence, the observed patterns between the Bad Rate and the values of each predictor should be interpretable.
id
desc
next_pymnt_d
verification_status_joint
sec_app_earliest_cr_line
hardship_type
hardship_reason
hardship_status
hardship_start_date
hardship_end_date
payment_plan_start_date
hardship_loan_status
debt_settlement_flag_date
settlement_status
settlement_date
# First batch of columns to drop: identifiers, free text, and
# post-origination hardship/settlement fields that are not available
# at application time.
cols_to_be_dropped_a = (
    "id desc next_pymnt_d verification_status_joint "
    "sec_app_earliest_cr_line hardship_type hardship_reason "
    "hardship_status hardship_start_date hardship_end_date "
    "payment_plan_start_date hardship_loan_status "
    "debt_settlement_flag_date settlement_status settlement_date"
).split()
# Second batch of columns to drop: servicing/outcome fields, granular
# bureau counters, secondary-applicant attributes, and the full
# hardship/settlement section.
cols_to_be_dropped_b = (
    # servicing / outcome fields
    "out_prncp desc initial_list_status collection_recovery_fee "
    "last_pymnt_d last_pymnt_amnt last_credit_pull_d "
    "collections_12_mths_ex_med mths_since_rcnt_il "
    # granular bureau counters
    "max_bal_bc all_util inq_fi bc_open_to_buy bc_util "
    "mo_sin_old_il_acct mo_sin_old_rev_tl_op mo_sin_rcnt_rev_tl_op "
    "mo_sin_rcnt_tl mths_since_recent_bc num_actv_bc_tl num_actv_rev_tl "
    "num_bc_sats num_bc_tl num_il_tl num_op_rev_tl num_rev_tl_bal_gt_0 "
    "num_sats num_tl_120dpd_2m num_tl_30dpd num_tl_90g_dpd_24m "
    "num_tl_op_past_12m pct_tl_nvr_dlq percent_bc_gt_75 tax_liens "
    "tot_hi_cred_lim total_bc_limit total_il_high_credit_limit "
    # secondary-applicant attributes
    "sec_app_fico_range_low sec_app_fico_range_high "
    "sec_app_earliest_cr_line sec_app_inq_last_6mths sec_app_mort_acc "
    "sec_app_open_acc sec_app_revol_util sec_app_open_act_il "
    "sec_app_num_rev_accts sec_app_chargeoff_within_12_mths "
    "sec_app_collections_12_mths_ex_med "
    "sec_app_mths_since_last_major_derog "
    # hardship / settlement section
    "hardship_flag hardship_type hardship_reason hardship_status "
    "deferral_term hardship_amount hardship_start_date hardship_end_date "
    "payment_plan_start_date hardship_length hardship_dpd "
    "hardship_loan_status orig_projected_additional_accrued_interest "
    "hardship_payoff_balance_amount hardship_last_payment_amount "
    "disbursement_method debt_settlement_flag debt_settlement_flag_date "
    "settlement_status settlement_date settlement_amount "
    "settlement_percentage settlement_term"
).split()
The following variables may not have descriptions -but they are self-explanatory and can also provide us with very useful insights. Thus, we will proceed with analyzing these variables as well
annual_inc
addr_state
dti
delinq_2yrs
earliest_cr_line
fico_range_low
fico_range_high
last_fico_range_high
last_fico_range_low
mths_since_last_major_derog
application_type
annual_inc_joint
dti_joint
acc_now_delinq
il_util
inq_last_12m
acc_open_past_24mths
avg_cur_bal
chargeoff_within_12_mths
delinq_amnt
mort_acc
mths_since_recent_bc_dlq
mths_since_recent_inq
mths_since_recent_revol_delinq
num_accts_ever_120_pd
num_rev_accts
pub_rec_bankruptcies
total_bal_ex_mort
revol_bal_joint
Columns that have description but either do not have meaningful usage or will lead to selection bias
Same goes for the "Grade" characteristic, as the model estimate should be based on credit loan, bureau ,demographic , transactional data, that are hihgly interpretable.
# Third batch of columns to drop: fields with no meaningful usage or
# that would leak the lender's own risk assessment (grade, int_rate)
# into the model, causing selection bias.
cols_to_be_dropped_c = (
    "id member_id funded_amnt_inv int_rate grade sub_grade emp_title "
    "pymnt_plan url title out_prncp_inv total_pymnt_inv total_rec_int "
    "total_rec_late_fee next_pymnt_d"
).split()
Drop the unwanted variables
# Combine the three drop lists into one de-duplicated list via set union
cols_to_be_dropped = list(
    set(cols_to_be_dropped_a) | set(cols_to_be_dropped_b) | set(cols_to_be_dropped_c)
)
# Remove the unwanted columns from the dataset
dataset = dataset.drop(columns=cols_to_be_dropped)
# Confirm the columns have been dropped
print(dataset.shape)
dataset.head()
(238636, 64)
| loan_amnt | funded_amnt | term | installment | emp_length | home_ownership | annual_inc | verification_status | issue_d | loan_status | purpose | zip_code | addr_state | dti | delinq_2yrs | earliest_cr_line | fico_range_low | fico_range_high | inq_last_6mths | mths_since_last_delinq | mths_since_last_record | open_acc | pub_rec | revol_bal | revol_util | total_acc | total_pymnt | total_rec_prncp | recoveries | last_fico_range_high | last_fico_range_low | mths_since_last_major_derog | policy_code | application_type | annual_inc_joint | dti_joint | acc_now_delinq | tot_coll_amt | tot_cur_bal | open_acc_6m | open_act_il | open_il_12m | open_il_24m | total_bal_il | il_util | open_rv_12m | open_rv_24m | total_rev_hi_lim | total_cu_tl | inq_last_12m | acc_open_past_24mths | avg_cur_bal | chargeoff_within_12_mths | delinq_amnt | mort_acc | mths_since_recent_bc_dlq | mths_since_recent_inq | mths_since_recent_revol_delinq | num_accts_ever_120_pd | num_rev_accts | pub_rec_bankruptcies | total_bal_ex_mort | revol_bal_joint | Defaulted | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5000.0 | 5000.0 | 36 months | 186.82 | 8 years | RENT | 50000.0 | Verified | 2018-03-01 | Current | other | 740xx | OK | 21.80 | 1.0 | Jan-2009 | 665.0 | 669.0 | 0.0 | 9.0 | NaN | 5.0 | 0.0 | 116.0 | 23.2 | 18.0 | 2043.690000 | 1219.69 | 0.0 | 609.0 | 605.0 | 9.0 | 1.0 | Individual | NaN | NaN | 0.0 | 0.0 | 19344.0 | 0.0 | 2.0 | 0.0 | 1.0 | 14118.0 | 51.0 | 1.0 | 2.0 | 500.0 | 0.0 | 5.0 | 3.0 | 3869.0 | 1.0 | 0.0 | 0.0 | NaN | 2.0 | NaN | 4.0 | 2.0 | 0.0 | 19344.0 | NaN | 0 |
| 1 | 15000.0 | 15000.0 | 36 months | 483.45 | 2 years | OWN | 196000.0 | Source Verified | 2018-03-01 | Current | debt_consolidation | 337xx | FL | 18.29 | 0.0 | Jul-1998 | 700.0 | 704.0 | 0.0 | 65.0 | NaN | 19.0 | 0.0 | 24243.0 | 46.3 | 53.0 | 5301.420000 | 4121.50 | 0.0 | 694.0 | 690.0 | NaN | 1.0 | Individual | NaN | NaN | 0.0 | 0.0 | 534954.0 | 4.0 | 3.0 | 2.0 | 2.0 | 113470.0 | 59.0 | 4.0 | 12.0 | 52400.0 | 1.0 | 7.0 | 15.0 | 31468.0 | 0.0 | 0.0 | 5.0 | NaN | 6.0 | NaN | 0.0 | 37.0 | 0.0 | 137713.0 | NaN | 0 |
| 2 | 11200.0 | 11200.0 | 60 months | 367.82 | < 1 year | RENT | 44000.0 | Not Verified | 2018-03-01 | Current | medical | 030xx | NH | 43.97 | 1.0 | Jul-2007 | 665.0 | 669.0 | 2.0 | 6.0 | NaN | 8.0 | 0.0 | 1526.0 | 24.6 | 14.0 | 4007.700000 | 1006.27 | 0.0 | 629.0 | 625.0 | 70.0 | 1.0 | Joint App | 81000.0 | 31.94 | 0.0 | 0.0 | 67173.0 | 1.0 | 4.0 | 1.0 | 4.0 | 65647.0 | 89.0 | 1.0 | 1.0 | 6200.0 | 1.0 | 10.0 | 5.0 | 8397.0 | 0.0 | 0.0 | 0.0 | 35.0 | 0.0 | 35.0 | 1.0 | 6.0 | 0.0 | 67173.0 | 7101.0 | 0 |
| 3 | 25000.0 | 25000.0 | 60 months | 688.35 | 10+ years | MORTGAGE | 65000.0 | Source Verified | 2018-03-01 | Current | debt_consolidation | 361xx | AL | 12.89 | 1.0 | Mar-1995 | 665.0 | 669.0 | 1.0 | 22.0 | NaN | 7.0 | 0.0 | 8657.0 | 98.4 | 16.0 | 7511.160000 | 2811.27 | 0.0 | 669.0 | 665.0 | 23.0 | 1.0 | Individual | NaN | NaN | 0.0 | 0.0 | 74795.0 | 0.0 | 2.0 | 0.0 | 2.0 | 8382.0 | 82.0 | 0.0 | 0.0 | 8800.0 | 3.0 | 3.0 | 2.0 | 10685.0 | 0.0 | 0.0 | 2.0 | NaN | 0.0 | 22.0 | 2.0 | 9.0 | 0.0 | 17039.0 | NaN | 0 |
| 4 | 3000.0 | 3000.0 | 36 months | 93.10 | 9 years | RENT | 52000.0 | Source Verified | 2018-03-01 | Fully Paid | major_purchase | 988xx | WA | 0.58 | 0.0 | Jan-1998 | 760.0 | 764.0 | 0.0 | 26.0 | NaN | 7.0 | 0.0 | 141.0 | 0.5 | 30.0 | 3011.577285 | 3000.00 | 0.0 | 764.0 | 760.0 | NaN | 1.0 | Individual | NaN | NaN | 0.0 | 0.0 | 150592.0 | 0.0 | 0.0 | 1.0 | 2.0 | 0.0 | NaN | 0.0 | 1.0 | 31000.0 | 2.0 | 2.0 | 3.0 | 25099.0 | 0.0 | 0.0 | 4.0 | NaN | 7.0 | NaN | 0.0 | 19.0 | 0.0 | 141.0 | NaN | 0 |
Continuous Vs Discrete Vs Categorical variables
# Exclude the target variable (and its raw source) from the feature lists
exclude_columns = ['loan_status', 'Defaulted']
# Partition the remaining columns in a single pass, preserving column order:
#   object dtype                      -> categorical
#   numeric with < 20 distinct values -> discrete
#   any other numeric                 -> continuous
categorical, discrete, continuous = [], [], []
for var in dataset.columns:
    if var in exclude_columns:
        continue
    if dataset[var].dtype == 'O':
        categorical.append(var)
    elif dataset[var].nunique() < 20:
        discrete.append(var)
    else:
        continuous.append(var)
# Show which variables were classified as categorical (object dtype)
print('Categorical Variables: ', categorical)
Categorical Variables: ['term', 'emp_length', 'home_ownership', 'verification_status', 'purpose', 'zip_code', 'addr_state', 'earliest_cr_line', 'application_type']
# Show which numeric variables were classified as discrete (< 20 distinct values)
print('Discrete Variables: ', discrete)
Discrete Variables: ['issue_d', 'inq_last_6mths', 'pub_rec', 'policy_code', 'acc_now_delinq', 'open_acc_6m', 'open_il_12m', 'chargeoff_within_12_mths', 'pub_rec_bankruptcies']
# Show which numeric variables were classified as continuous
print('Continuous Variables: ', continuous)
Continuous Variables: ['loan_amnt', 'funded_amnt', 'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'fico_range_low', 'fico_range_high', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'revol_bal', 'revol_util', 'total_acc', 'total_pymnt', 'total_rec_prncp', 'recoveries', 'last_fico_range_high', 'last_fico_range_low', 'mths_since_last_major_derog', 'annual_inc_joint', 'dti_joint', 'tot_coll_amt', 'tot_cur_bal', 'open_act_il', 'open_il_24m', 'total_bal_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'total_rev_hi_lim', 'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths', 'avg_cur_bal', 'delinq_amnt', 'mort_acc', 'mths_since_recent_bc_dlq', 'mths_since_recent_inq', 'mths_since_recent_revol_delinq', 'num_accts_ever_120_pd', 'num_rev_accts', 'total_bal_ex_mort', 'revol_bal_joint']
# Cardinality of each continuous variable, sorted ascending
num_feat = continuous
dataset[num_feat].nunique().sort_values()
open_il_24m 20 open_rv_12m 23 delinq_2yrs 23 mths_since_recent_inq 26 mort_acc 28 num_accts_ever_120_pd 35 open_rv_24m 36 fico_range_low 38 fico_range_high 38 inq_last_12m 41 acc_open_past_24mths 41 total_cu_tl 42 open_act_il 43 open_acc 70 last_fico_range_low 71 last_fico_range_high 72 num_rev_accts 89 delinq_amnt 107 total_acc 121 mths_since_last_record 127 mths_since_last_delinq 141 mths_since_recent_bc_dlq 147 mths_since_last_major_derog 148 mths_since_recent_revol_delinq 150 il_util 212 revol_util 1097 funded_amnt 1547 loan_amnt 1547 recoveries 1963 dti_joint 3808 tot_coll_amt 5552 total_rev_hi_lim 5751 annual_inc_joint 6477 dti 7704 annual_inc 18140 installment 21130 revol_bal_joint 27131 avg_cur_bal 49650 revol_bal 49801 total_rec_prncp 75633 total_bal_il 85610 total_bal_ex_mort 104425 total_pymnt 132825 tot_cur_bal 159030 dtype: int64
# Cardinality of each discrete variable, sorted ascending
num_feat = discrete
dataset[num_feat].nunique().sort_values()
policy_code 1 acc_now_delinq 2 issue_d 6 inq_last_6mths 6 pub_rec_bankruptcies 7 open_il_12m 8 chargeoff_within_12_mths 8 open_acc_6m 15 pub_rec 16 dtype: int64
# Cardinality of each categorical variable, sorted ascending
num_feat = categorical
dataset[num_feat].nunique().sort_values()
term 2 application_type 2 verification_status 3 home_ownership 4 emp_length 11 purpose 13 addr_state 50 earliest_cr_line 661 zip_code 886 dtype: int64
# Drop "policy_code", since it is a constant
dataset = dataset.drop(columns="policy_code")
dataset.shape
(238636, 63)
# Also remove it from the list of discrete variables,
# keeping the feature lists in sync with the dataframe
discrete.remove('policy_code')
Function to calculate the percentage and cumulative percentage of the values in each column
def freq(df):
    """
    Build a frequency table for every column of *df*.

    For each column, the unique values (NaN included) are counted,
    expressed as percentages of the total number of rows, and the
    cumulative percentage over the index-sorted values is accumulated.

    Args:
        df (pd.DataFrame): the data to summarise.

    Returns:
        dict: A dictionary where keys are column names and values are
        DataFrames containing the frequency, percentage and cumulative
        percentage of each unique value.
    """
    result = {}
    # Hoist the row count out of the loop -it is the same for every column
    n_rows = len(df)
    for col in df.columns:
        # dropna=False so that missing values appear as their own category
        value_counts = df[col].value_counts(dropna=False).sort_index()
        percentages = (value_counts / n_rows) * 100
        cumulative_percentages = percentages.cumsum()
        freq_df = pd.DataFrame({
            'Frequency': value_counts,
            'Percentage': percentages,
            'Cumulative Percentage': cumulative_percentages
        })
        result[col] = freq_df
    return result
# Obtain the frequency and percentage of unique values for each column
freq_results = freq(dataset)
# Print the frequency table of every categorical variable
for name in categorical:
    print(f"Results for column: {name}")
    print(freq_results[name])
    print("\n")
Results for column: term
Frequency Percentage Cumulative Percentage
term
36 months 166918 69.946697 69.946697
60 months 71718 30.053303 100.000000
Results for column: emp_length
Frequency Percentage Cumulative Percentage
emp_length
1 year 15805 6.623058 6.623058
10+ years 79521 33.323136 39.946194
2 years 22483 9.421462 49.367656
3 years 20051 8.402337 57.769993
4 years 15550 6.516200 64.286193
5 years 15014 6.291591 70.577784
6 years 10805 4.527816 75.105600
7 years 8958 3.753834 78.859434
8 years 7612 3.189795 82.049230
9 years 6703 2.808880 84.858110
< 1 year 16238 6.804506 91.662616
NaN 19896 8.337384 100.000000
Results for column: home_ownership
Frequency Percentage Cumulative Percentage
home_ownership
ANY 18 0.007543 0.007543
MORTGAGE 113594 47.601368 47.608911
OWN 30917 12.955715 60.564626
RENT 94107 39.435374 100.000000
Results for column: verification_status
Frequency Percentage Cumulative Percentage
verification_status
Not Verified 91375 38.290535 38.290535
Source Verified 94744 39.702308 77.992843
Verified 52517 22.007157 100.000000
Results for column: purpose
Frequency Percentage Cumulative Percentage
purpose
car 2886 1.209373 1.209373
credit_card 57399 24.052951 25.262324
debt_consolidation 119224 49.960609 75.222934
home_improvement 17292 7.246182 82.469116
house 3819 1.600345 84.069461
major_purchase 6980 2.924957 86.994418
medical 3723 1.560117 88.554535
moving 1677 0.702744 89.257279
other 21043 8.818032 98.075311
renewable_energy 135 0.056572 98.131883
small_business 2589 1.084916 99.216799
vacation 1864 0.781106 99.997905
wedding 5 0.002095 100.000000
Results for column: zip_code
Frequency Percentage Cumulative Percentage
zip_code
010xx 372 0.155886 0.155886
011xx 147 0.061600 0.217486
012xx 100 0.041905 0.259391
013xx 72 0.030171 0.289562
014xx 183 0.076686 0.366248
... ... ... ...
995xx 278 0.116495 99.878895
996xx 136 0.056991 99.935886
997xx 92 0.038552 99.974438
998xx 47 0.019695 99.994133
999xx 14 0.005867 100.000000
[886 rows x 3 columns]
Results for column: addr_state
Frequency Percentage Cumulative Percentage
addr_state
AK 565 0.236762 0.236762
AL 2753 1.153640 1.390402
AR 1833 0.768115 2.158518
AZ 5842 2.448080 4.606597
CA 32004 13.411220 18.017818
CO 5447 2.282556 20.300374
CT 3802 1.593221 21.893595
DC 456 0.191086 22.084681
DE 664 0.278248 22.362929
FL 17809 7.462830 29.825760
GA 8001 3.352805 33.178565
HI 934 0.391391 33.569956
ID 797 0.333981 33.903937
IL 9415 3.945339 37.849277
IN 4174 1.749107 39.598384
KS 1843 0.772306 40.370690
KY 2256 0.945373 41.316063
LA 2536 1.062706 42.378769
MA 5452 2.284651 44.663420
MD 5793 2.427547 47.090967
ME 784 0.328534 47.419501
MI 6087 2.550747 49.970248
MN 3963 1.660688 51.630936
MO 3779 1.583583 53.214519
MS 1524 0.638630 53.853149
MT 688 0.288305 54.141454
NC 6799 2.849109 56.990563
ND 512 0.214553 57.205116
NE 1147 0.480648 57.685764
NH 1226 0.513753 58.199517
NJ 8363 3.504501 61.704018
NM 1220 0.511239 62.215257
NV 3658 1.532879 63.748135
NY 19126 8.014717 71.762852
OH 7778 3.259357 75.022210
OK 2249 0.942440 75.964649
OR 2917 1.222364 77.187013
PA 7646 3.204043 80.391056
RI 1089 0.456344 80.847399
SC 3096 1.297373 82.144773
SD 470 0.196953 82.341725
TN 3954 1.656917 83.998642
TX 19713 8.260698 92.259341
UT 1519 0.636534 92.895875
VA 6252 2.619890 95.515765
VT 590 0.247238 95.763003
WA 4973 2.083927 97.846930
WI 3182 1.333412 99.180342
WV 1476 0.618515 99.798857
WY 480 0.201143 100.000000
Results for column: earliest_cr_line
Frequency Percentage Cumulative Percentage
earliest_cr_line
Apr-1955 1 0.000419 0.000419
Apr-1963 3 0.001257 0.001676
Apr-1964 3 0.001257 0.002933
Apr-1965 10 0.004190 0.007124
Apr-1966 9 0.003771 0.010895
... ... ... ...
Sep-2010 787 0.329791 98.978779
Sep-2011 798 0.334401 99.313180
Sep-2012 599 0.251010 99.564190
Sep-2013 553 0.231734 99.795923
Sep-2014 487 0.204077 100.000000
[661 rows x 3 columns]
Results for column: application_type
Frequency Percentage Cumulative Percentage
application_type
Individual 204218 85.577197 85.577197
Joint App 34418 14.422803 100.000000
Zip code shows no concentration in any particular value, so no solid predictions can be made from it. Thus, it will be dropped as well
# Drop "zip_code", since it has too many values and no concentration is identified in any specific value
dataset = dataset.drop(columns="zip_code")
# Also remove it from the list of discrete variables
categorical.remove('zip_code')
# Fraction of missing values in each categorical variable
for name in categorical:
    print(f"Missing values for column: {name}")
    print(dataset[name].isna().mean())
    print("\n")
Missing values for column: term 0.0 Missing values for column: emp_length 0.08337384133156775 Missing values for column: home_ownership 0.0 Missing values for column: verification_status 0.0 Missing values for column: purpose 0.0 Missing values for column: addr_state 0.0 Missing values for column: earliest_cr_line 0.0 Missing values for column: application_type 0.0
# Print the frequency table of every discrete variable
for name in discrete:
    print(f"Results for column: {name}")
    print(freq_results[name])
    print("\n")
Results for column: issue_d
Frequency Percentage Cumulative Percentage
issue_d
2018-01-01 36347 15.231147 15.231147
2018-02-01 32746 13.722154 28.953301
2018-03-01 38771 16.246920 45.200221
2018-04-01 42928 17.988904 63.189125
2018-05-01 46311 19.406544 82.595669
2018-06-01 41533 17.404331 100.000000
Results for column: inq_last_6mths
Frequency Percentage Cumulative Percentage
inq_last_6mths
0.0 158875 66.576292 66.576292
1.0 56996 23.884074 90.460366
2.0 17353 7.271744 97.732111
3.0 5157 2.161032 99.893143
4.0 188 0.078781 99.971924
5.0 67 0.028076 100.000000
Results for column: pub_rec
Frequency Percentage Cumulative Percentage
pub_rec
0.0 206809 86.662951 86.662951
1.0 29932 12.542952 99.205904
2.0 1239 0.519201 99.725104
3.0 368 0.154210 99.879314
4.0 154 0.064533 99.943848
5.0 66 0.027657 99.971505
6.0 29 0.012152 99.983657
7.0 17 0.007124 99.990781
8.0 7 0.002933 99.993714
9.0 6 0.002514 99.996229
10.0 2 0.000838 99.997067
13.0 3 0.001257 99.998324
15.0 1 0.000419 99.998743
19.0 1 0.000419 99.999162
24.0 1 0.000419 99.999581
52.0 1 0.000419 100.000000
Results for column: acc_now_delinq
Frequency Percentage Cumulative Percentage
acc_now_delinq
0.0 238610 99.989105 99.989105
1.0 26 0.010895 100.000000
Results for column: open_acc_6m
Frequency Percentage Cumulative Percentage
open_acc_6m
0.0 112222 47.026434 47.026434
1.0 72299 30.296770 77.323204
2.0 33227 13.923716 91.246920
3.0 13316 5.580047 96.826967
4.0 4827 2.022746 98.849713
5.0 1741 0.729563 99.579276
6.0 586 0.245562 99.824838
7.0 246 0.103086 99.927924
8.0 112 0.046933 99.974857
9.0 36 0.015086 99.989943
10.0 11 0.004610 99.994552
11.0 6 0.002514 99.997067
12.0 4 0.001676 99.998743
13.0 2 0.000838 99.999581
15.0 1 0.000419 100.000000
Results for column: open_il_12m
Frequency Percentage Cumulative Percentage
open_il_12m
0.0 132357 55.463970 55.463970
1.0 69795 29.247473 84.711443
2.0 25129 10.530264 95.241707
3.0 7771 3.256424 98.498131
4.0 2467 1.033792 99.531923
5.0 809 0.339010 99.870933
6.0 307 0.128648 99.999581
8.0 1 0.000419 100.000000
Results for column: chargeoff_within_12_mths
Frequency Percentage Cumulative Percentage
chargeoff_within_12_mths
0.0 237159 99.381066 99.381066
1.0 1374 0.575772 99.956838
2.0 81 0.033943 99.990781
3.0 13 0.005448 99.996229
4.0 6 0.002514 99.998743
6.0 1 0.000419 99.999162
7.0 1 0.000419 99.999581
9.0 1 0.000419 100.000000
Results for column: pub_rec_bankruptcies
Frequency Percentage Cumulative Percentage
pub_rec_bankruptcies
0.0 209476 87.780553 87.780553
1.0 28793 12.065656 99.846209
2.0 314 0.131581 99.977790
3.0 40 0.016762 99.994552
4.0 10 0.004190 99.998743
5.0 2 0.000838 99.999581
7.0 1 0.000419 100.000000
# Drop "acc_now_delinq", "chargeoff_within_12_mths" are quasi-constants, will be dropped as well
dataset = dataset.drop(columns=["acc_now_delinq","chargeoff_within_12_mths"])
# Also remove it from the list of discrete variables
discrete.remove('acc_now_delinq')
discrete.remove('chargeoff_within_12_mths')
# Fraction of missing values in each discrete variable
for name in discrete:
    print(f"Missing values for column: {name}")
    print(dataset[name].isna().mean())
    print("\n")
Missing values for column: issue_d 0.0 Missing values for column: inq_last_6mths 0.0 Missing values for column: pub_rec 0.0 Missing values for column: open_acc_6m 0.0 Missing values for column: open_il_12m 0.0 Missing values for column: pub_rec_bankruptcies 0.0
# Fraction of missing values in each continuous variable
for name in continuous:
    print(f"Missing values for column: {name}")
    print(dataset[name].isna().mean())
    print("\n")
Missing values for column: loan_amnt 0.0 Missing values for column: funded_amnt 0.0 Missing values for column: installment 0.0 Missing values for column: annual_inc 0.0 Missing values for column: dti 0.002464003754672388 Missing values for column: delinq_2yrs 0.0 Missing values for column: fico_range_low 0.0 Missing values for column: fico_range_high 0.0 Missing values for column: mths_since_last_delinq 0.557614944937059 Missing values for column: mths_since_last_record 0.866629511054493 Missing values for column: open_acc 0.0 Missing values for column: revol_bal 0.0 Missing values for column: revol_util 0.0012864781508238488 Missing values for column: total_acc 0.0 Missing values for column: total_pymnt 0.0 Missing values for column: total_rec_prncp 0.0 Missing values for column: recoveries 0.0 Missing values for column: last_fico_range_high 0.0 Missing values for column: last_fico_range_low 0.0 Missing values for column: mths_since_last_major_derog 0.7660746911614341 Missing values for column: annual_inc_joint 0.8557719707001459 Missing values for column: dti_joint 0.8557719707001459 Missing values for column: tot_coll_amt 0.0 Missing values for column: tot_cur_bal 0.0 Missing values for column: open_act_il 0.0 Missing values for column: open_il_24m 0.0 Missing values for column: total_bal_il 0.0 Missing values for column: il_util 0.167770160411673 Missing values for column: open_rv_12m 0.0 Missing values for column: open_rv_24m 0.0 Missing values for column: total_rev_hi_lim 0.0 Missing values for column: total_cu_tl 0.0 Missing values for column: inq_last_12m 0.0 Missing values for column: acc_open_past_24mths 0.0 Missing values for column: avg_cur_bal 6.285723863960173e-05 Missing values for column: delinq_amnt 0.0 Missing values for column: mort_acc 0.0 Missing values for column: mths_since_recent_bc_dlq 0.8008850299200456 Missing values for column: mths_since_recent_inq 0.1225380914866156 Missing values for column: mths_since_recent_revol_delinq 
0.7115816557434754 Missing values for column: num_accts_ever_120_pd 0.0 Missing values for column: num_rev_accts 0.0 Missing values for column: total_bal_ex_mort 0.0 Missing values for column: revol_bal_joint 0.8557719707001459
# Confirm the dimensions after the latest column drops
dataset.shape
(238636, 60)
# No duplicate record was identified -moving on
dataset = dataset.drop_duplicates()
# Shape is unchanged, confirming there were no duplicates
dataset.shape
(238636, 60)
Before proceeding with any statistical evaluation or preparatory processing of the data, it is essential to define the Development (training) and Performance Validation (test) sets, in order to prevent the possibility of data leakage into the PV data. We do not want -under any circumstances- to proceed with any preparatory action that would make the algorithm perform better by using information from the entire data set.
Hence:
# Time-based split: Q1 2018 loans become the Development (training) set,
# April 2018 loans become the Performance Validation (test) set.
train_window = (dataset.issue_d >= '2018-01-01 00:00:00') & (dataset.issue_d < '2018-04-01 00:00:00')
test_window = (dataset.issue_d >= '2018-04-01 00:00:00') & (dataset.issue_d < '2018-05-01 00:00:00')
for_training = dataset.loc[train_window]
for_test = dataset.loc[test_window]
print(for_training.shape)
for_training.head()
(107864, 60)
| loan_amnt | funded_amnt | term | installment | emp_length | home_ownership | annual_inc | verification_status | issue_d | loan_status | purpose | addr_state | dti | delinq_2yrs | earliest_cr_line | fico_range_low | fico_range_high | inq_last_6mths | mths_since_last_delinq | mths_since_last_record | open_acc | pub_rec | revol_bal | revol_util | total_acc | total_pymnt | total_rec_prncp | recoveries | last_fico_range_high | last_fico_range_low | mths_since_last_major_derog | application_type | annual_inc_joint | dti_joint | tot_coll_amt | tot_cur_bal | open_acc_6m | open_act_il | open_il_12m | open_il_24m | total_bal_il | il_util | open_rv_12m | open_rv_24m | total_rev_hi_lim | total_cu_tl | inq_last_12m | acc_open_past_24mths | avg_cur_bal | delinq_amnt | mort_acc | mths_since_recent_bc_dlq | mths_since_recent_inq | mths_since_recent_revol_delinq | num_accts_ever_120_pd | num_rev_accts | pub_rec_bankruptcies | total_bal_ex_mort | revol_bal_joint | Defaulted | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5000.0 | 5000.0 | 36 months | 186.82 | 8 years | RENT | 50000.0 | Verified | 2018-03-01 | Current | other | OK | 21.80 | 1.0 | Jan-2009 | 665.0 | 669.0 | 0.0 | 9.0 | NaN | 5.0 | 0.0 | 116.0 | 23.2 | 18.0 | 2043.690000 | 1219.69 | 0.0 | 609.0 | 605.0 | 9.0 | Individual | NaN | NaN | 0.0 | 19344.0 | 0.0 | 2.0 | 0.0 | 1.0 | 14118.0 | 51.0 | 1.0 | 2.0 | 500.0 | 0.0 | 5.0 | 3.0 | 3869.0 | 0.0 | 0.0 | NaN | 2.0 | NaN | 4.0 | 2.0 | 0.0 | 19344.0 | NaN | 0 |
| 1 | 15000.0 | 15000.0 | 36 months | 483.45 | 2 years | OWN | 196000.0 | Source Verified | 2018-03-01 | Current | debt_consolidation | FL | 18.29 | 0.0 | Jul-1998 | 700.0 | 704.0 | 0.0 | 65.0 | NaN | 19.0 | 0.0 | 24243.0 | 46.3 | 53.0 | 5301.420000 | 4121.50 | 0.0 | 694.0 | 690.0 | NaN | Individual | NaN | NaN | 0.0 | 534954.0 | 4.0 | 3.0 | 2.0 | 2.0 | 113470.0 | 59.0 | 4.0 | 12.0 | 52400.0 | 1.0 | 7.0 | 15.0 | 31468.0 | 0.0 | 5.0 | NaN | 6.0 | NaN | 0.0 | 37.0 | 0.0 | 137713.0 | NaN | 0 |
| 2 | 11200.0 | 11200.0 | 60 months | 367.82 | < 1 year | RENT | 44000.0 | Not Verified | 2018-03-01 | Current | medical | NH | 43.97 | 1.0 | Jul-2007 | 665.0 | 669.0 | 2.0 | 6.0 | NaN | 8.0 | 0.0 | 1526.0 | 24.6 | 14.0 | 4007.700000 | 1006.27 | 0.0 | 629.0 | 625.0 | 70.0 | Joint App | 81000.0 | 31.94 | 0.0 | 67173.0 | 1.0 | 4.0 | 1.0 | 4.0 | 65647.0 | 89.0 | 1.0 | 1.0 | 6200.0 | 1.0 | 10.0 | 5.0 | 8397.0 | 0.0 | 0.0 | 35.0 | 0.0 | 35.0 | 1.0 | 6.0 | 0.0 | 67173.0 | 7101.0 | 0 |
| 3 | 25000.0 | 25000.0 | 60 months | 688.35 | 10+ years | MORTGAGE | 65000.0 | Source Verified | 2018-03-01 | Current | debt_consolidation | AL | 12.89 | 1.0 | Mar-1995 | 665.0 | 669.0 | 1.0 | 22.0 | NaN | 7.0 | 0.0 | 8657.0 | 98.4 | 16.0 | 7511.160000 | 2811.27 | 0.0 | 669.0 | 665.0 | 23.0 | Individual | NaN | NaN | 0.0 | 74795.0 | 0.0 | 2.0 | 0.0 | 2.0 | 8382.0 | 82.0 | 0.0 | 0.0 | 8800.0 | 3.0 | 3.0 | 2.0 | 10685.0 | 0.0 | 2.0 | NaN | 0.0 | 22.0 | 2.0 | 9.0 | 0.0 | 17039.0 | NaN | 0 |
| 4 | 3000.0 | 3000.0 | 36 months | 93.10 | 9 years | RENT | 52000.0 | Source Verified | 2018-03-01 | Fully Paid | major_purchase | WA | 0.58 | 0.0 | Jan-1998 | 760.0 | 764.0 | 0.0 | 26.0 | NaN | 7.0 | 0.0 | 141.0 | 0.5 | 30.0 | 3011.577285 | 3000.00 | 0.0 | 764.0 | 760.0 | NaN | Individual | NaN | NaN | 0.0 | 150592.0 | 0.0 | 0.0 | 1.0 | 2.0 | 0.0 | NaN | 0.0 | 1.0 | 31000.0 | 2.0 | 2.0 | 3.0 | 25099.0 | 0.0 | 4.0 | NaN | 7.0 | NaN | 0.0 | 19.0 | 0.0 | 141.0 | NaN | 0 |
# Confirm the size of the Performance Validation (test) split and preview it
print(for_test.shape)
for_test.head()
(42928, 60)
| loan_amnt | funded_amnt | term | installment | emp_length | home_ownership | annual_inc | verification_status | issue_d | loan_status | purpose | addr_state | dti | delinq_2yrs | earliest_cr_line | fico_range_low | fico_range_high | inq_last_6mths | mths_since_last_delinq | mths_since_last_record | open_acc | pub_rec | revol_bal | revol_util | total_acc | total_pymnt | total_rec_prncp | recoveries | last_fico_range_high | last_fico_range_low | mths_since_last_major_derog | application_type | annual_inc_joint | dti_joint | tot_coll_amt | tot_cur_bal | open_acc_6m | open_act_il | open_il_12m | open_il_24m | total_bal_il | il_util | open_rv_12m | open_rv_24m | total_rev_hi_lim | total_cu_tl | inq_last_12m | acc_open_past_24mths | avg_cur_bal | delinq_amnt | mort_acc | mths_since_recent_bc_dlq | mths_since_recent_inq | mths_since_recent_revol_delinq | num_accts_ever_120_pd | num_rev_accts | pub_rec_bankruptcies | total_bal_ex_mort | revol_bal_joint | Defaulted | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 194739 | 9600.0 | 9600.0 | 36 months | 332.98 | 10+ years | MORTGAGE | 145000.0 | Verified | 2018-04-01 | Current | other | GA | 25.99 | 1.0 | Oct-1988 | 670.0 | 674.0 | 1.0 | 21.0 | NaN | 26.0 | 0.0 | 72673.0 | 83.3 | 35.0 | 3325.15 | 2250.64 | 0.0 | 669.0 | 665.0 | NaN | Individual | NaN | NaN | 0.0 | 449697.0 | 1.0 | 2.0 | 1.0 | 2.0 | 27401.0 | 88.0 | 0.0 | 0.0 | 87200.0 | 0.0 | 4.0 | 3.0 | 17296.0 | 0.0 | 3.0 | NaN | 3.0 | 21.0 | 0.0 | 28.0 | 0.0 | 100074.0 | NaN | 0 |
| 194743 | 12800.0 | 12800.0 | 60 months | 295.06 | 4 years | RENT | 40000.0 | Not Verified | 2018-04-01 | Current | debt_consolidation | MD | 38.28 | 0.0 | Jul-1991 | 725.0 | 729.0 | 0.0 | NaN | NaN | 24.0 | 0.0 | 18798.0 | 47.7 | 33.0 | 2940.94 | 1580.91 | 0.0 | 694.0 | 690.0 | NaN | Individual | NaN | NaN | 0.0 | 38625.0 | 0.0 | 2.0 | 2.0 | 2.0 | 19827.0 | 56.0 | 0.0 | 5.0 | 39400.0 | 2.0 | 0.0 | 7.0 | 1756.0 | 0.0 | 0.0 | NaN | 16.0 | NaN | 0.0 | 25.0 | 0.0 | 38625.0 | NaN | 0 |
| 194747 | 20000.0 | 20000.0 | 60 months | 486.47 | 10+ years | MORTGAGE | 70000.0 | Not Verified | 2018-04-01 | Current | debt_consolidation | NE | 3.60 | 0.0 | Jan-2006 | 695.0 | 699.0 | 1.0 | NaN | NaN | 5.0 | 0.0 | 8409.0 | 71.3 | 31.0 | 10242.59 | 8279.87 | 0.0 | 694.0 | 690.0 | NaN | Joint App | 140000.0 | 1.80 | 0.0 | 183164.0 | 2.0 | 1.0 | 1.0 | 3.0 | 2314.0 | 101.0 | 0.0 | 1.0 | 11800.0 | 0.0 | 3.0 | 5.0 | 36633.0 | 0.0 | 2.0 | NaN | 5.0 | NaN | 0.0 | 15.0 | 0.0 | 10723.0 | 8409.0 | 0 |
| 194749 | 24775.0 | 24775.0 | 60 months | 602.62 | 10+ years | MORTGAGE | 58000.0 | Not Verified | 2018-04-01 | Current | debt_consolidation | MI | 30.58 | 0.0 | Aug-2002 | 720.0 | 724.0 | 0.0 | NaN | NaN | 9.0 | 0.0 | 23769.0 | 70.3 | 24.0 | 6469.97 | 3199.93 | 0.0 | 714.0 | 710.0 | NaN | Joint App | 120000.0 | 21.36 | 0.0 | 294371.0 | 0.0 | 3.0 | 1.0 | 2.0 | 42667.0 | 83.0 | 0.0 | 0.0 | 33800.0 | 9.0 | 2.0 | 2.0 | 32708.0 | 0.0 | 2.0 | NaN | 8.0 | NaN | 0.0 | 8.0 | 0.0 | 66436.0 | 56210.0 | 0 |
| 194750 | 2000.0 | 2000.0 | 36 months | 64.46 | 3 years | RENT | 33600.0 | Source Verified | 2018-04-01 | Current | credit_card | NH | 11.57 | 0.0 | Dec-2014 | 700.0 | 704.0 | 2.0 | NaN | NaN | 5.0 | 0.0 | 2518.0 | 32.3 | 6.0 | 643.50 | 497.49 | 0.0 | 624.0 | 620.0 | NaN | Individual | NaN | NaN | 0.0 | 97754.0 | 1.0 | 1.0 | 0.0 | 0.0 | 5339.0 | 38.0 | 1.0 | 3.0 | 7800.0 | 2.0 | 11.0 | 4.0 | 19551.0 | 0.0 | 1.0 | NaN | 1.0 | NaN | 0.0 | 3.0 | 0.0 | 7857.0 | NaN | 0 |
# Separate the training features from the target. 'loan_status' is removed
# as well — the 'Defaulted' flag is presumably derived from it, so keeping
# it would leak the label (TODO confirm against the flag's construction).
X_train = for_training.drop(columns=['Defaulted', 'loan_status'])
y_train = for_training['Defaulted']
X_train.shape, y_train.shape
((107864, 58), (107864,))
# Same feature/target separation for the test split: drop the target and
# the 'loan_status' column it was presumably derived from.
X_test = for_test.drop(columns=['Defaulted', 'loan_status'])
y_test = for_test['Defaulted']
X_test.shape, y_test.shape
((42928, 58), (42928,))
# Display the training feature matrix for a visual inspection
X_train
| loan_amnt | funded_amnt | term | installment | emp_length | home_ownership | annual_inc | verification_status | issue_d | purpose | addr_state | dti | delinq_2yrs | earliest_cr_line | fico_range_low | fico_range_high | inq_last_6mths | mths_since_last_delinq | mths_since_last_record | open_acc | pub_rec | revol_bal | revol_util | total_acc | total_pymnt | total_rec_prncp | recoveries | last_fico_range_high | last_fico_range_low | mths_since_last_major_derog | application_type | annual_inc_joint | dti_joint | tot_coll_amt | tot_cur_bal | open_acc_6m | open_act_il | open_il_12m | open_il_24m | total_bal_il | il_util | open_rv_12m | open_rv_24m | total_rev_hi_lim | total_cu_tl | inq_last_12m | acc_open_past_24mths | avg_cur_bal | delinq_amnt | mort_acc | mths_since_recent_bc_dlq | mths_since_recent_inq | mths_since_recent_revol_delinq | num_accts_ever_120_pd | num_rev_accts | pub_rec_bankruptcies | total_bal_ex_mort | revol_bal_joint | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 5000.0 | 5000.0 | 36 months | 186.82 | 8 years | RENT | 50000.00 | Verified | 2018-03-01 | other | OK | 21.80 | 1.0 | Jan-2009 | 665.0 | 669.0 | 0.0 | 9.0 | NaN | 5.0 | 0.0 | 116.0 | 23.2 | 18.0 | 2043.690000 | 1219.69 | 0.0 | 609.0 | 605.0 | 9.0 | Individual | NaN | NaN | 0.0 | 19344.0 | 0.0 | 2.0 | 0.0 | 1.0 | 14118.0 | 51.0 | 1.0 | 2.0 | 500.0 | 0.0 | 5.0 | 3.0 | 3869.0 | 0.0 | 0.0 | NaN | 2.0 | NaN | 4.0 | 2.0 | 0.0 | 19344.0 | NaN |
| 1 | 15000.0 | 15000.0 | 36 months | 483.45 | 2 years | OWN | 196000.00 | Source Verified | 2018-03-01 | debt_consolidation | FL | 18.29 | 0.0 | Jul-1998 | 700.0 | 704.0 | 0.0 | 65.0 | NaN | 19.0 | 0.0 | 24243.0 | 46.3 | 53.0 | 5301.420000 | 4121.50 | 0.0 | 694.0 | 690.0 | NaN | Individual | NaN | NaN | 0.0 | 534954.0 | 4.0 | 3.0 | 2.0 | 2.0 | 113470.0 | 59.0 | 4.0 | 12.0 | 52400.0 | 1.0 | 7.0 | 15.0 | 31468.0 | 0.0 | 5.0 | NaN | 6.0 | NaN | 0.0 | 37.0 | 0.0 | 137713.0 | NaN |
| 2 | 11200.0 | 11200.0 | 60 months | 367.82 | < 1 year | RENT | 44000.00 | Not Verified | 2018-03-01 | medical | NH | 43.97 | 1.0 | Jul-2007 | 665.0 | 669.0 | 2.0 | 6.0 | NaN | 8.0 | 0.0 | 1526.0 | 24.6 | 14.0 | 4007.700000 | 1006.27 | 0.0 | 629.0 | 625.0 | 70.0 | Joint App | 81000.0 | 31.94 | 0.0 | 67173.0 | 1.0 | 4.0 | 1.0 | 4.0 | 65647.0 | 89.0 | 1.0 | 1.0 | 6200.0 | 1.0 | 10.0 | 5.0 | 8397.0 | 0.0 | 0.0 | 35.0 | 0.0 | 35.0 | 1.0 | 6.0 | 0.0 | 67173.0 | 7101.0 |
| 3 | 25000.0 | 25000.0 | 60 months | 688.35 | 10+ years | MORTGAGE | 65000.00 | Source Verified | 2018-03-01 | debt_consolidation | AL | 12.89 | 1.0 | Mar-1995 | 665.0 | 669.0 | 1.0 | 22.0 | NaN | 7.0 | 0.0 | 8657.0 | 98.4 | 16.0 | 7511.160000 | 2811.27 | 0.0 | 669.0 | 665.0 | 23.0 | Individual | NaN | NaN | 0.0 | 74795.0 | 0.0 | 2.0 | 0.0 | 2.0 | 8382.0 | 82.0 | 0.0 | 0.0 | 8800.0 | 3.0 | 3.0 | 2.0 | 10685.0 | 0.0 | 2.0 | NaN | 0.0 | 22.0 | 2.0 | 9.0 | 0.0 | 17039.0 | NaN |
| 4 | 3000.0 | 3000.0 | 36 months | 93.10 | 9 years | RENT | 52000.00 | Source Verified | 2018-03-01 | major_purchase | WA | 0.58 | 0.0 | Jan-1998 | 760.0 | 764.0 | 0.0 | 26.0 | NaN | 7.0 | 0.0 | 141.0 | 0.5 | 30.0 | 3011.577285 | 3000.00 | 0.0 | 764.0 | 760.0 | NaN | Individual | NaN | NaN | 0.0 | 150592.0 | 0.0 | 0.0 | 1.0 | 2.0 | 0.0 | NaN | 0.0 | 1.0 | 31000.0 | 2.0 | 2.0 | 3.0 | 25099.0 | 0.0 | 4.0 | NaN | 7.0 | NaN | 0.0 | 19.0 | 0.0 | 141.0 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 107859 | 12000.0 | 12000.0 | 60 months | 270.71 | NaN | MORTGAGE | 89625.39 | Not Verified | 2018-01-01 | debt_consolidation | CA | 17.61 | 0.0 | Jan-1982 | 660.0 | 664.0 | 0.0 | 40.0 | NaN | 10.0 | 0.0 | 18601.0 | 90.3 | 37.0 | 12483.154233 | 12000.00 | 0.0 | 689.0 | 685.0 | 40.0 | Individual | NaN | NaN | 0.0 | 473894.0 | 1.0 | 2.0 | 2.0 | 2.0 | 25536.0 | 88.0 | 1.0 | 1.0 | 20600.0 | 0.0 | 0.0 | 3.0 | 47389.0 | 0.0 | 1.0 | 40.0 | NaN | 40.0 | 16.0 | 27.0 | 0.0 | 44374.0 | NaN |
| 107860 | 4375.0 | 4375.0 | 36 months | 149.70 | 10+ years | MORTGAGE | 52000.00 | Not Verified | 2018-01-01 | home_improvement | IL | 33.72 | 0.0 | Feb-1994 | 690.0 | 694.0 | 0.0 | NaN | NaN | 22.0 | 0.0 | 28116.0 | 49.2 | 41.0 | 2092.380000 | 1487.25 | 0.0 | 684.0 | 680.0 | NaN | Individual | NaN | NaN | 249.0 | 217780.0 | 1.0 | 2.0 | 0.0 | 2.0 | 22184.0 | 66.0 | 1.0 | 3.0 | 57200.0 | 0.0 | 0.0 | 5.0 | 9899.0 | 0.0 | 3.0 | NaN | 17.0 | NaN | 0.0 | 34.0 | 0.0 | 50300.0 | NaN |
| 107861 | 6000.0 | 6000.0 | 36 months | 196.18 | 10+ years | MORTGAGE | 50000.00 | Source Verified | 2018-01-01 | debt_consolidation | NH | 28.93 | 0.0 | Jun-1997 | 690.0 | 694.0 | 0.0 | 58.0 | NaN | 11.0 | 0.0 | 6950.0 | 51.9 | 14.0 | 2742.880000 | 2104.37 | 0.0 | 819.0 | 815.0 | 58.0 | Individual | NaN | NaN | 0.0 | 230614.0 | 0.0 | 1.0 | 1.0 | 1.0 | 18497.0 | 83.0 | 1.0 | 1.0 | 13400.0 | 0.0 | 2.0 | 2.0 | 20965.0 | 0.0 | 2.0 | 58.0 | 7.0 | 58.0 | 1.0 | 11.0 | 0.0 | 25447.0 | NaN |
| 107862 | 12000.0 | 12000.0 | 36 months | 389.58 | 8 years | MORTGAGE | 36000.00 | Verified | 2018-01-01 | debt_consolidation | IN | 11.10 | 1.0 | May-1998 | 685.0 | 689.0 | 0.0 | 21.0 | NaN | 14.0 | 0.0 | 11648.0 | 43.6 | 18.0 | 5593.050000 | 4383.16 | 0.0 | 694.0 | 690.0 | 21.0 | Individual | NaN | NaN | 0.0 | 191131.0 | 2.0 | 1.0 | 0.0 | 0.0 | 105786.0 | NaN | 3.0 | 6.0 | 26700.0 | 0.0 | 1.0 | 6.0 | 14702.0 | 0.0 | 1.0 | NaN | 11.0 | NaN | 1.0 | 12.0 | 0.0 | 117434.0 | NaN |
| 107863 | 14000.0 | 14000.0 | 36 months | 475.71 | 2 years | OWN | 80000.00 | Source Verified | 2018-01-01 | car | CA | 1.35 | 0.0 | Jul-2007 | 660.0 | 664.0 | 1.0 | 31.0 | NaN | 11.0 | 0.0 | 1461.0 | 4.1 | 21.0 | 14662.947011 | 14000.00 | 0.0 | 674.0 | 670.0 | 31.0 | Individual | NaN | NaN | 0.0 | 1461.0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | NaN | 1.0 | 2.0 | 35300.0 | 0.0 | 2.0 | 3.0 | 162.0 | 0.0 | 0.0 | 31.0 | 0.0 | 31.0 | 5.0 | 19.0 | 0.0 | 1461.0 | NaN |
107864 rows × 58 columns
# Display the test feature matrix for a visual inspection
X_test
| loan_amnt | funded_amnt | term | installment | emp_length | home_ownership | annual_inc | verification_status | issue_d | purpose | addr_state | dti | delinq_2yrs | earliest_cr_line | fico_range_low | fico_range_high | inq_last_6mths | mths_since_last_delinq | mths_since_last_record | open_acc | pub_rec | revol_bal | revol_util | total_acc | total_pymnt | total_rec_prncp | recoveries | last_fico_range_high | last_fico_range_low | mths_since_last_major_derog | application_type | annual_inc_joint | dti_joint | tot_coll_amt | tot_cur_bal | open_acc_6m | open_act_il | open_il_12m | open_il_24m | total_bal_il | il_util | open_rv_12m | open_rv_24m | total_rev_hi_lim | total_cu_tl | inq_last_12m | acc_open_past_24mths | avg_cur_bal | delinq_amnt | mort_acc | mths_since_recent_bc_dlq | mths_since_recent_inq | mths_since_recent_revol_delinq | num_accts_ever_120_pd | num_rev_accts | pub_rec_bankruptcies | total_bal_ex_mort | revol_bal_joint | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 194739 | 9600.0 | 9600.0 | 36 months | 332.98 | 10+ years | MORTGAGE | 145000.0 | Verified | 2018-04-01 | other | GA | 25.99 | 1.0 | Oct-1988 | 670.0 | 674.0 | 1.0 | 21.0 | NaN | 26.0 | 0.0 | 72673.0 | 83.3 | 35.0 | 3325.15 | 2250.64 | 0.0 | 669.0 | 665.0 | NaN | Individual | NaN | NaN | 0.0 | 449697.0 | 1.0 | 2.0 | 1.0 | 2.0 | 27401.0 | 88.0 | 0.0 | 0.0 | 87200.0 | 0.0 | 4.0 | 3.0 | 17296.0 | 0.0 | 3.0 | NaN | 3.0 | 21.0 | 0.0 | 28.0 | 0.0 | 100074.0 | NaN |
| 194743 | 12800.0 | 12800.0 | 60 months | 295.06 | 4 years | RENT | 40000.0 | Not Verified | 2018-04-01 | debt_consolidation | MD | 38.28 | 0.0 | Jul-1991 | 725.0 | 729.0 | 0.0 | NaN | NaN | 24.0 | 0.0 | 18798.0 | 47.7 | 33.0 | 2940.94 | 1580.91 | 0.0 | 694.0 | 690.0 | NaN | Individual | NaN | NaN | 0.0 | 38625.0 | 0.0 | 2.0 | 2.0 | 2.0 | 19827.0 | 56.0 | 0.0 | 5.0 | 39400.0 | 2.0 | 0.0 | 7.0 | 1756.0 | 0.0 | 0.0 | NaN | 16.0 | NaN | 0.0 | 25.0 | 0.0 | 38625.0 | NaN |
| 194747 | 20000.0 | 20000.0 | 60 months | 486.47 | 10+ years | MORTGAGE | 70000.0 | Not Verified | 2018-04-01 | debt_consolidation | NE | 3.60 | 0.0 | Jan-2006 | 695.0 | 699.0 | 1.0 | NaN | NaN | 5.0 | 0.0 | 8409.0 | 71.3 | 31.0 | 10242.59 | 8279.87 | 0.0 | 694.0 | 690.0 | NaN | Joint App | 140000.0 | 1.80 | 0.0 | 183164.0 | 2.0 | 1.0 | 1.0 | 3.0 | 2314.0 | 101.0 | 0.0 | 1.0 | 11800.0 | 0.0 | 3.0 | 5.0 | 36633.0 | 0.0 | 2.0 | NaN | 5.0 | NaN | 0.0 | 15.0 | 0.0 | 10723.0 | 8409.0 |
| 194749 | 24775.0 | 24775.0 | 60 months | 602.62 | 10+ years | MORTGAGE | 58000.0 | Not Verified | 2018-04-01 | debt_consolidation | MI | 30.58 | 0.0 | Aug-2002 | 720.0 | 724.0 | 0.0 | NaN | NaN | 9.0 | 0.0 | 23769.0 | 70.3 | 24.0 | 6469.97 | 3199.93 | 0.0 | 714.0 | 710.0 | NaN | Joint App | 120000.0 | 21.36 | 0.0 | 294371.0 | 0.0 | 3.0 | 1.0 | 2.0 | 42667.0 | 83.0 | 0.0 | 0.0 | 33800.0 | 9.0 | 2.0 | 2.0 | 32708.0 | 0.0 | 2.0 | NaN | 8.0 | NaN | 0.0 | 8.0 | 0.0 | 66436.0 | 56210.0 |
| 194750 | 2000.0 | 2000.0 | 36 months | 64.46 | 3 years | RENT | 33600.0 | Source Verified | 2018-04-01 | credit_card | NH | 11.57 | 0.0 | Dec-2014 | 700.0 | 704.0 | 2.0 | NaN | NaN | 5.0 | 0.0 | 2518.0 | 32.3 | 6.0 | 643.50 | 497.49 | 0.0 | 624.0 | 620.0 | NaN | Individual | NaN | NaN | 0.0 | 97754.0 | 1.0 | 1.0 | 0.0 | 0.0 | 5339.0 | 38.0 | 1.0 | 3.0 | 7800.0 | 2.0 | 11.0 | 4.0 | 19551.0 | 0.0 | 1.0 | NaN | 1.0 | NaN | 0.0 | 3.0 | 0.0 | 7857.0 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 238631 | 12000.0 | 12000.0 | 36 months | 398.46 | NaN | MORTGAGE | 60000.0 | Not Verified | 2018-04-01 | major_purchase | NV | 28.64 | 0.0 | Mar-2005 | 720.0 | 724.0 | 0.0 | NaN | 102.0 | 16.0 | 1.0 | 12025.0 | 38.2 | 22.0 | 3570.17 | 2610.46 | 0.0 | 739.0 | 735.0 | NaN | Individual | NaN | NaN | 0.0 | 230207.0 | 0.0 | 2.0 | 0.0 | 0.0 | 28116.0 | 34.0 | 5.0 | 9.0 | 31500.0 | 0.0 | 2.0 | 9.0 | 15347.0 | 0.0 | 1.0 | NaN | 7.0 | NaN | 0.0 | 16.0 | 1.0 | 40141.0 | NaN |
| 238632 | 3000.0 | 3000.0 | 36 months | 112.09 | 10+ years | RENT | 72000.0 | Not Verified | 2018-04-01 | other | NJ | 5.87 | 0.0 | Oct-2007 | 660.0 | 664.0 | 0.0 | NaN | NaN | 11.0 | 0.0 | 5509.0 | 60.5 | 13.0 | 1117.50 | 659.82 | 0.0 | 499.0 | 0.0 | NaN | Individual | NaN | NaN | 0.0 | 5509.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 3.0 | 9.0 | 9100.0 | 0.0 | 1.0 | 9.0 | 501.0 | 0.0 | 0.0 | NaN | 8.0 | NaN | 0.0 | 13.0 | 0.0 | 5509.0 | NaN |
| 238633 | 14000.0 | 14000.0 | 36 months | 492.27 | 10+ years | MORTGAGE | 52874.0 | Not Verified | 2018-04-01 | debt_consolidation | CA | 22.36 | 0.0 | Sep-1995 | 660.0 | 664.0 | 1.0 | NaN | 87.0 | 9.0 | 1.0 | 25426.0 | 85.0 | 20.0 | 5402.52 | 3593.73 | 0.0 | 679.0 | 675.0 | NaN | Individual | NaN | NaN | 0.0 | 32056.0 | 2.0 | 1.0 | 1.0 | 1.0 | 6630.0 | 88.0 | 1.0 | 2.0 | 29900.0 | 1.0 | 1.0 | 3.0 | 3561.0 | 0.0 | 0.0 | NaN | 6.0 | NaN | 0.0 | 19.0 | 1.0 | 32056.0 | NaN |
| 238634 | 7500.0 | 7500.0 | 36 months | 245.19 | 10+ years | RENT | 126000.0 | Not Verified | 2018-04-01 | other | NY | 6.33 | 0.0 | Dec-1986 | 740.0 | 744.0 | 1.0 | NaN | NaN | 16.0 | 0.0 | 11122.0 | 28.3 | 31.0 | 2688.01 | 2038.63 | 0.0 | 709.0 | 705.0 | NaN | Individual | NaN | NaN | 0.0 | 132066.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 5.0 | 8.0 | 39300.0 | 0.0 | 2.0 | 11.0 | 8254.0 | 0.0 | 3.0 | NaN | 1.0 | NaN | 0.0 | 24.0 | 0.0 | 11122.0 | NaN |
| 238635 | 35000.0 | 35000.0 | 36 months | 1065.88 | 10+ years | RENT | 110000.0 | Source Verified | 2018-04-01 | debt_consolidation | FL | 9.80 | 0.0 | Jul-1995 | 715.0 | 719.0 | 0.0 | NaN | NaN | 13.0 | 0.0 | 39634.0 | 35.2 | 21.0 | 11712.88 | 10028.29 | 0.0 | 774.0 | 770.0 | NaN | Individual | NaN | NaN | 0.0 | 39634.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0 | NaN | 2.0 | 3.0 | 112500.0 | 0.0 | 0.0 | 3.0 | 3049.0 | 0.0 | 2.0 | NaN | NaN | NaN | 0.0 | 14.0 | 0.0 | 39634.0 | NaN |
42928 rows × 58 columns
# Screen all numeric (continuous + discrete) predictors for pairwise correlation
continuous_discr = continuous + discrete
corr_matrix = X_train[continuous_discr].corr()
# Keep only the strict upper triangle so each pair appears exactly once
# (avoids both the diagonal and the mirrored (B, A) duplicates)
upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper_triangle = corr_matrix.where(upper_mask)
# Flatten to long format and retain pairs with |correlation| >= 0.85
high_corr_pairs = upper_triangle.stack().reset_index()
high_corr_pairs.columns = ['Feature 1', 'Feature 2', 'Correlation']
high_corr_pairs = high_corr_pairs[high_corr_pairs['Correlation'].abs() >= 0.85]
print(high_corr_pairs)
Feature 1 Feature 2 Correlation 0 loan_amnt funded_amnt 1.000000 1 loan_amnt installment 0.944957 49 funded_amnt installment 0.944957 279 fico_range_low fico_range_high 1.000000 394 mths_since_last_delinq mths_since_recent_revol_delinq 0.860612 595 total_pymnt total_rec_prncp 0.975128 697 last_fico_range_high last_fico_range_low 0.876065 964 total_bal_il total_bal_ex_mort 0.907411 1148 mths_since_recent_bc_dlq mths_since_recent_revol_delinq 0.893902
# For each highly correlated pair, keep one predictor and drop the other,
# guided by the conceptual soundness of the analysis that will follow.
# NOTE(review): 'pub_rec_bankruptcies' does not appear in the printed
# >=0.85 pairs, and both 'loan_amnt' and 'funded_amnt' are removed
# ('installment' is the survivor of that trio) — confirm these choices.
drop_correlated_cols = [
    "funded_amnt",
    "loan_amnt",
    "fico_range_high",
    "mths_since_recent_revol_delinq",
    "pub_rec_bankruptcies",
    "total_rec_prncp",
    "last_fico_range_low",
    "total_bal_il",
    "total_bal_ex_mort",
]
# Remove the columns from both splits and confirm the resulting shapes
X_train = X_train.drop(columns=drop_correlated_cols)
X_test = X_test.drop(columns=drop_correlated_cols)
for frame in (X_train, X_test):
    print(frame.shape)
(107864, 49) (42928, 49)
# Partition the remaining predictors by type:
#   - object dtype            -> categorical
#   - numeric, < 20 uniques   -> discrete
#   - numeric otherwise       -> continuous
categorical, discrete, continuous = [], [], []
for col in X_train.columns:
    if X_train[col].dtype == 'O':
        categorical.append(col)
    elif X_train[col].nunique() < 20:
        discrete.append(col)
    else:
        continuous.append(col)
print('Categorical Variables: ', categorical)
print('Discrete Variables: ', discrete)
print('Continuous Variables: ', continuous)
Categorical Variables: ['term', 'emp_length', 'home_ownership', 'verification_status', 'purpose', 'addr_state', 'earliest_cr_line', 'application_type'] Discrete Variables: ['issue_d', 'inq_last_6mths', 'pub_rec', 'open_acc_6m', 'open_il_12m', 'open_il_24m'] Continuous Variables: ['installment', 'annual_inc', 'dti', 'delinq_2yrs', 'fico_range_low', 'mths_since_last_delinq', 'mths_since_last_record', 'open_acc', 'revol_bal', 'revol_util', 'total_acc', 'total_pymnt', 'recoveries', 'last_fico_range_high', 'mths_since_last_major_derog', 'annual_inc_joint', 'dti_joint', 'tot_coll_amt', 'tot_cur_bal', 'open_act_il', 'il_util', 'open_rv_12m', 'open_rv_24m', 'total_rev_hi_lim', 'total_cu_tl', 'inq_last_12m', 'acc_open_past_24mths', 'avg_cur_bal', 'delinq_amnt', 'mort_acc', 'mths_since_recent_bc_dlq', 'mths_since_recent_inq', 'num_accts_ever_120_pd', 'num_rev_accts', 'revol_bal_joint']
# Remove also the "issue_d" from the list of discrete vars, since it does not provide any discriminatory power
# (and even if it did, using the origination date as a predictor wouldn't be conceptually sound)
# Note: the column itself stays in the data; it is only excluded from the discrete list
discrete.remove('issue_d')
# Summary statistics of the numeric training features
X_train.describe()
| installment | annual_inc | issue_d | dti | delinq_2yrs | fico_range_low | inq_last_6mths | mths_since_last_delinq | mths_since_last_record | open_acc | pub_rec | revol_bal | revol_util | total_acc | total_pymnt | recoveries | last_fico_range_high | mths_since_last_major_derog | annual_inc_joint | dti_joint | tot_coll_amt | tot_cur_bal | open_acc_6m | open_act_il | open_il_12m | open_il_24m | il_util | open_rv_12m | open_rv_24m | total_rev_hi_lim | total_cu_tl | inq_last_12m | acc_open_past_24mths | avg_cur_bal | delinq_amnt | mort_acc | mths_since_recent_bc_dlq | mths_since_recent_inq | num_accts_ever_120_pd | num_rev_accts | revol_bal_joint | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 107864.000000 | 1.078640e+05 | 107864 | 107602.000000 | 107864.000000 | 107864.000000 | 107864.000000 | 47169.000000 | 15269.000000 | 107864.000000 | 107864.000000 | 107864.000000 | 107715.000000 | 107864.000000 | 107864.000000 | 107864.000000 | 107864.000000 | 24761.000000 | 1.633100e+04 | 16331.000000 | 1.078640e+05 | 1.078640e+05 | 107864.000000 | 107864.000000 | 107864.000000 | 107864.000000 | 89880.000000 | 107864.000000 | 107864.000000 | 1.078640e+05 | 107864.000000 | 107864.000000 | 107864.000000 | 107854.000000 | 107864.000000 | 107864.000000 | 21298.000000 | 94216.000000 | 107864.000000 | 107864.000000 | 16331.000000 |
| mean | 469.646694 | 7.854227e+04 | 2018-01-31 14:50:23.273752064 | 19.648209 | 0.223773 | 708.248303 | 0.461266 | 36.980008 | 79.156657 | 11.377151 | 0.163243 | 16093.382880 | 43.372762 | 22.618019 | 7694.295719 | 31.738431 | 705.486956 | 46.193167 | 1.255066e+05 | 19.661708 | 2.932763e+02 | 1.437632e+05 | 0.904936 | 2.655613 | 0.648252 | 1.494975 | 67.606019 | 1.221529 | 2.601331 | 3.835303e+04 | 1.458457 | 1.954786 | 4.376817 | 13884.263523 | 3.559640 | 1.392967 | 40.445723 | 7.390942 | 0.449242 | 12.970871 | 34531.574919 |
| min | 29.760000 | 0.000000e+00 | 2018-01-01 00:00:00 | 0.000000 | 0.000000 | 660.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.800000e+04 | 0.000000 | 0.000000e+00 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 |
| 25% | 254.560000 | 4.500000e+04 | 2018-01-01 00:00:00 | 11.230000 | 0.000000 | 680.000000 | 0.000000 | 19.000000 | 64.000000 | 7.000000 | 0.000000 | 5170.000000 | 23.500000 | 14.000000 | 3497.757500 | 0.000000 | 674.000000 | 30.000000 | 8.500000e+04 | 13.730000 | 0.000000e+00 | 2.649400e+04 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 53.000000 | 0.000000 | 1.000000 | 1.600000e+04 | 0.000000 | 0.000000 | 2.000000 | 2925.000000 | 0.000000 | 0.000000 | 23.000000 | 2.000000 | 0.000000 | 7.000000 | 15760.500000 |
| 50% | 389.360000 | 6.500000e+04 | 2018-02-01 00:00:00 | 17.670000 | 0.000000 | 700.000000 | 0.000000 | 34.000000 | 83.000000 | 10.000000 | 0.000000 | 10657.000000 | 41.500000 | 21.000000 | 5744.400000 | 0.000000 | 709.000000 | 47.000000 | 1.120000e+05 | 19.200000 | 0.000000e+00 | 7.513550e+04 | 1.000000 | 2.000000 | 0.000000 | 1.000000 | 70.000000 | 1.000000 | 2.000000 | 2.850000e+04 | 0.000000 | 1.000000 | 4.000000 | 7169.500000 | 0.000000 | 1.000000 | 38.000000 | 6.000000 | 0.000000 | 11.000000 | 27652.000000 |
| 75% | 637.647500 | 9.500000e+04 | 2018-03-01 00:00:00 | 25.020000 | 0.000000 | 730.000000 | 1.000000 | 53.000000 | 98.000000 | 14.000000 | 0.000000 | 19511.250000 | 61.800000 | 29.000000 | 9696.480000 | 0.000000 | 744.000000 | 63.000000 | 1.500000e+05 | 25.200000 | 0.000000e+00 | 2.173230e+05 | 1.000000 | 3.000000 | 1.000000 | 2.000000 | 84.000000 | 2.000000 | 4.000000 | 4.890000e+04 | 2.000000 | 3.000000 | 6.000000 | 19215.000000 | 0.000000 | 2.000000 | 57.000000 | 11.000000 | 0.000000 | 17.000000 | 45080.500000 |
| max | 1618.030000 | 8.365188e+06 | 2018-03-01 00:00:00 | 999.000000 | 20.000000 | 845.000000 | 5.000000 | 226.000000 | 124.000000 | 69.000000 | 52.000000 | 925589.000000 | 191.000000 | 153.000000 | 51653.389338 | 21251.790000 | 850.000000 | 226.000000 | 1.187000e+06 | 39.980000 | 6.214661e+06 | 5.752177e+06 | 12.000000 | 41.000000 | 8.000000 | 20.000000 | 1000.000000 | 19.000000 | 38.000000 | 1.123500e+06 | 52.000000 | 46.000000 | 38.000000 | 620531.000000 | 65000.000000 | 46.000000 | 194.000000 | 24.000000 | 37.000000 | 151.000000 | 371153.000000 |
| std | 289.215801 | 7.687436e+04 | NaN | 21.795902 | 0.730417 | 37.205453 | 0.746153 | 21.802486 | 24.817082 | 5.879388 | 0.494799 | 22176.260283 | 25.149181 | 12.027177 | 6738.872144 | 334.216920 | 59.401365 | 21.555390 | 6.681665e+04 | 8.038172 | 2.481319e+04 | 1.676133e+05 | 1.122125 | 2.930210 | 0.912269 | 1.531529 | 24.141432 | 1.474057 | 2.527823 | 3.671479e+04 | 2.632854 | 2.372746 | 3.191143 | 17487.395429 | 324.829448 | 1.751895 | 22.257444 | 6.014327 | 1.335028 | 7.885984 | 28087.749846 |
Diagnostic plots for the Discrete variables
for var in discrete:
    # Hoist the skewness computation so it is done once per variable
    skewness = np.round(X_train[var].skew(), 2)
    plt.figure(figsize=(15, 4))
    # Left panel: histogram with KDE overlay; skewness shown in the legend
    plt.subplot(1, 2, 1)
    sns.histplot(X_train[var], kde=True, label='skew' + str(skewness))
    plt.legend()
    # Right panel: box plot to highlight outliers
    plt.subplot(1, 2, 2)
    sns.boxplot(x=X_train[var])
    plt.tight_layout()
    plt.show()
Diagnostic plots for the Continuous variables
for var in continuous:
    # Hoist the skewness computation so it is done once per variable
    skewness = np.round(X_train[var].skew(), 2)
    plt.figure(figsize=(15, 4))
    # Left panel: histogram with KDE overlay; skewness shown in the legend
    plt.subplot(1, 2, 1)
    sns.histplot(X_train[var], kde=True, label='skew' + str(skewness))
    plt.legend()
    # Right panel: box plot to highlight outliers
    plt.subplot(1, 2, 2)
    sns.boxplot(x=X_train[var])
    plt.tight_layout()
    plt.show()
# Rebuild one frame (features + target) for the segmentation analysis.
# Resetting both indexes to a fresh RangeIndex first makes a column-wise
# concat equivalent to an index-aligned merge.
X_train_for_segm = X_train.reset_index(drop=True)
y_train_for_segm = y_train.reset_index(drop=True)
for_segmentation = pd.concat([X_train_for_segm, y_train_for_segm], axis=1)
for_segmentation.head()
| term | installment | emp_length | home_ownership | annual_inc | verification_status | issue_d | purpose | addr_state | dti | delinq_2yrs | earliest_cr_line | fico_range_low | inq_last_6mths | mths_since_last_delinq | mths_since_last_record | open_acc | pub_rec | revol_bal | revol_util | total_acc | total_pymnt | recoveries | last_fico_range_high | mths_since_last_major_derog | application_type | annual_inc_joint | dti_joint | tot_coll_amt | tot_cur_bal | open_acc_6m | open_act_il | open_il_12m | open_il_24m | il_util | open_rv_12m | open_rv_24m | total_rev_hi_lim | total_cu_tl | inq_last_12m | acc_open_past_24mths | avg_cur_bal | delinq_amnt | mort_acc | mths_since_recent_bc_dlq | mths_since_recent_inq | num_accts_ever_120_pd | num_rev_accts | revol_bal_joint | Defaulted | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 36 months | 186.82 | 8 years | RENT | 50000.0 | Verified | 2018-03-01 | other | OK | 21.80 | 1.0 | Jan-2009 | 665.0 | 0.0 | 9.0 | NaN | 5.0 | 0.0 | 116.0 | 23.2 | 18.0 | 2043.690000 | 0.0 | 609.0 | 9.0 | Individual | NaN | NaN | 0.0 | 19344.0 | 0.0 | 2.0 | 0.0 | 1.0 | 51.0 | 1.0 | 2.0 | 500.0 | 0.0 | 5.0 | 3.0 | 3869.0 | 0.0 | 0.0 | NaN | 2.0 | 4.0 | 2.0 | NaN | 0 |
| 1 | 36 months | 483.45 | 2 years | OWN | 196000.0 | Source Verified | 2018-03-01 | debt_consolidation | FL | 18.29 | 0.0 | Jul-1998 | 700.0 | 0.0 | 65.0 | NaN | 19.0 | 0.0 | 24243.0 | 46.3 | 53.0 | 5301.420000 | 0.0 | 694.0 | NaN | Individual | NaN | NaN | 0.0 | 534954.0 | 4.0 | 3.0 | 2.0 | 2.0 | 59.0 | 4.0 | 12.0 | 52400.0 | 1.0 | 7.0 | 15.0 | 31468.0 | 0.0 | 5.0 | NaN | 6.0 | 0.0 | 37.0 | NaN | 0 |
| 2 | 60 months | 367.82 | < 1 year | RENT | 44000.0 | Not Verified | 2018-03-01 | medical | NH | 43.97 | 1.0 | Jul-2007 | 665.0 | 2.0 | 6.0 | NaN | 8.0 | 0.0 | 1526.0 | 24.6 | 14.0 | 4007.700000 | 0.0 | 629.0 | 70.0 | Joint App | 81000.0 | 31.94 | 0.0 | 67173.0 | 1.0 | 4.0 | 1.0 | 4.0 | 89.0 | 1.0 | 1.0 | 6200.0 | 1.0 | 10.0 | 5.0 | 8397.0 | 0.0 | 0.0 | 35.0 | 0.0 | 1.0 | 6.0 | 7101.0 | 0 |
| 3 | 60 months | 688.35 | 10+ years | MORTGAGE | 65000.0 | Source Verified | 2018-03-01 | debt_consolidation | AL | 12.89 | 1.0 | Mar-1995 | 665.0 | 1.0 | 22.0 | NaN | 7.0 | 0.0 | 8657.0 | 98.4 | 16.0 | 7511.160000 | 0.0 | 669.0 | 23.0 | Individual | NaN | NaN | 0.0 | 74795.0 | 0.0 | 2.0 | 0.0 | 2.0 | 82.0 | 0.0 | 0.0 | 8800.0 | 3.0 | 3.0 | 2.0 | 10685.0 | 0.0 | 2.0 | NaN | 0.0 | 2.0 | 9.0 | NaN | 0 |
| 4 | 36 months | 93.10 | 9 years | RENT | 52000.0 | Source Verified | 2018-03-01 | major_purchase | WA | 0.58 | 0.0 | Jan-1998 | 760.0 | 0.0 | 26.0 | NaN | 7.0 | 0.0 | 141.0 | 0.5 | 30.0 | 3011.577285 | 0.0 | 764.0 | NaN | Individual | NaN | NaN | 0.0 | 150592.0 | 0.0 | 0.0 | 1.0 | 2.0 | NaN | 0.0 | 1.0 | 31000.0 | 2.0 | 2.0 | 3.0 | 25099.0 | 0.0 | 4.0 | NaN | 7.0 | 0.0 | 19.0 | NaN | 0 |
# Impute missing values by feature type: a sentinel label for categoricals
# and a -1 sentinel for both discrete and continuous numerics.
for column_group, fill_value in ((categorical, 'MISSING'),
                                 (discrete, -1),
                                 (continuous, -1)):
    for_segmentation[column_group] = for_segmentation[column_group].fillna(fill_value)
# Integer copy of the debt-to-income ratio, used later for coarse bucketing.
for_segmentation['dti_rounded'] = for_segmentation['dti'].round(0).astype(int)
for_segmentation
| term | installment | emp_length | home_ownership | annual_inc | verification_status | issue_d | purpose | addr_state | dti | delinq_2yrs | earliest_cr_line | fico_range_low | inq_last_6mths | mths_since_last_delinq | mths_since_last_record | open_acc | pub_rec | revol_bal | revol_util | total_acc | total_pymnt | recoveries | last_fico_range_high | mths_since_last_major_derog | application_type | annual_inc_joint | dti_joint | tot_coll_amt | tot_cur_bal | open_acc_6m | open_act_il | open_il_12m | open_il_24m | il_util | open_rv_12m | open_rv_24m | total_rev_hi_lim | total_cu_tl | inq_last_12m | acc_open_past_24mths | avg_cur_bal | delinq_amnt | mort_acc | mths_since_recent_bc_dlq | mths_since_recent_inq | num_accts_ever_120_pd | num_rev_accts | revol_bal_joint | Defaulted | dti_rounded | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 36 months | 186.82 | 8 years | RENT | 50000.00 | Verified | 2018-03-01 | other | OK | 21.80 | 1.0 | Jan-2009 | 665.0 | 0.0 | 9.0 | -1.0 | 5.0 | 0.0 | 116.0 | 23.2 | 18.0 | 2043.690000 | 0.0 | 609.0 | 9.0 | Individual | -1.0 | -1.00 | 0.0 | 19344.0 | 0.0 | 2.0 | 0.0 | 1.0 | 51.0 | 1.0 | 2.0 | 500.0 | 0.0 | 5.0 | 3.0 | 3869.0 | 0.0 | 0.0 | -1.0 | 2.0 | 4.0 | 2.0 | -1.0 | 0 | 22 |
| 1 | 36 months | 483.45 | 2 years | OWN | 196000.00 | Source Verified | 2018-03-01 | debt_consolidation | FL | 18.29 | 0.0 | Jul-1998 | 700.0 | 0.0 | 65.0 | -1.0 | 19.0 | 0.0 | 24243.0 | 46.3 | 53.0 | 5301.420000 | 0.0 | 694.0 | -1.0 | Individual | -1.0 | -1.00 | 0.0 | 534954.0 | 4.0 | 3.0 | 2.0 | 2.0 | 59.0 | 4.0 | 12.0 | 52400.0 | 1.0 | 7.0 | 15.0 | 31468.0 | 0.0 | 5.0 | -1.0 | 6.0 | 0.0 | 37.0 | -1.0 | 0 | 18 |
| 2 | 60 months | 367.82 | < 1 year | RENT | 44000.00 | Not Verified | 2018-03-01 | medical | NH | 43.97 | 1.0 | Jul-2007 | 665.0 | 2.0 | 6.0 | -1.0 | 8.0 | 0.0 | 1526.0 | 24.6 | 14.0 | 4007.700000 | 0.0 | 629.0 | 70.0 | Joint App | 81000.0 | 31.94 | 0.0 | 67173.0 | 1.0 | 4.0 | 1.0 | 4.0 | 89.0 | 1.0 | 1.0 | 6200.0 | 1.0 | 10.0 | 5.0 | 8397.0 | 0.0 | 0.0 | 35.0 | 0.0 | 1.0 | 6.0 | 7101.0 | 0 | 44 |
| 3 | 60 months | 688.35 | 10+ years | MORTGAGE | 65000.00 | Source Verified | 2018-03-01 | debt_consolidation | AL | 12.89 | 1.0 | Mar-1995 | 665.0 | 1.0 | 22.0 | -1.0 | 7.0 | 0.0 | 8657.0 | 98.4 | 16.0 | 7511.160000 | 0.0 | 669.0 | 23.0 | Individual | -1.0 | -1.00 | 0.0 | 74795.0 | 0.0 | 2.0 | 0.0 | 2.0 | 82.0 | 0.0 | 0.0 | 8800.0 | 3.0 | 3.0 | 2.0 | 10685.0 | 0.0 | 2.0 | -1.0 | 0.0 | 2.0 | 9.0 | -1.0 | 0 | 13 |
| 4 | 36 months | 93.10 | 9 years | RENT | 52000.00 | Source Verified | 2018-03-01 | major_purchase | WA | 0.58 | 0.0 | Jan-1998 | 760.0 | 0.0 | 26.0 | -1.0 | 7.0 | 0.0 | 141.0 | 0.5 | 30.0 | 3011.577285 | 0.0 | 764.0 | -1.0 | Individual | -1.0 | -1.00 | 0.0 | 150592.0 | 0.0 | 0.0 | 1.0 | 2.0 | -1.0 | 0.0 | 1.0 | 31000.0 | 2.0 | 2.0 | 3.0 | 25099.0 | 0.0 | 4.0 | -1.0 | 7.0 | 0.0 | 19.0 | -1.0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 107859 | 60 months | 270.71 | MISSING | MORTGAGE | 89625.39 | Not Verified | 2018-01-01 | debt_consolidation | CA | 17.61 | 0.0 | Jan-1982 | 660.0 | 0.0 | 40.0 | -1.0 | 10.0 | 0.0 | 18601.0 | 90.3 | 37.0 | 12483.154233 | 0.0 | 689.0 | 40.0 | Individual | -1.0 | -1.00 | 0.0 | 473894.0 | 1.0 | 2.0 | 2.0 | 2.0 | 88.0 | 1.0 | 1.0 | 20600.0 | 0.0 | 0.0 | 3.0 | 47389.0 | 0.0 | 1.0 | 40.0 | -1.0 | 16.0 | 27.0 | -1.0 | 0 | 18 |
| 107860 | 36 months | 149.70 | 10+ years | MORTGAGE | 52000.00 | Not Verified | 2018-01-01 | home_improvement | IL | 33.72 | 0.0 | Feb-1994 | 690.0 | 0.0 | -1.0 | -1.0 | 22.0 | 0.0 | 28116.0 | 49.2 | 41.0 | 2092.380000 | 0.0 | 684.0 | -1.0 | Individual | -1.0 | -1.00 | 249.0 | 217780.0 | 1.0 | 2.0 | 0.0 | 2.0 | 66.0 | 1.0 | 3.0 | 57200.0 | 0.0 | 0.0 | 5.0 | 9899.0 | 0.0 | 3.0 | -1.0 | 17.0 | 0.0 | 34.0 | -1.0 | 0 | 34 |
| 107861 | 36 months | 196.18 | 10+ years | MORTGAGE | 50000.00 | Source Verified | 2018-01-01 | debt_consolidation | NH | 28.93 | 0.0 | Jun-1997 | 690.0 | 0.0 | 58.0 | -1.0 | 11.0 | 0.0 | 6950.0 | 51.9 | 14.0 | 2742.880000 | 0.0 | 819.0 | 58.0 | Individual | -1.0 | -1.00 | 0.0 | 230614.0 | 0.0 | 1.0 | 1.0 | 1.0 | 83.0 | 1.0 | 1.0 | 13400.0 | 0.0 | 2.0 | 2.0 | 20965.0 | 0.0 | 2.0 | 58.0 | 7.0 | 1.0 | 11.0 | -1.0 | 0 | 29 |
| 107862 | 36 months | 389.58 | 8 years | MORTGAGE | 36000.00 | Verified | 2018-01-01 | debt_consolidation | IN | 11.10 | 1.0 | May-1998 | 685.0 | 0.0 | 21.0 | -1.0 | 14.0 | 0.0 | 11648.0 | 43.6 | 18.0 | 5593.050000 | 0.0 | 694.0 | 21.0 | Individual | -1.0 | -1.00 | 0.0 | 191131.0 | 2.0 | 1.0 | 0.0 | 0.0 | -1.0 | 3.0 | 6.0 | 26700.0 | 0.0 | 1.0 | 6.0 | 14702.0 | 0.0 | 1.0 | -1.0 | 11.0 | 1.0 | 12.0 | -1.0 | 0 | 11 |
| 107863 | 36 months | 475.71 | 2 years | OWN | 80000.00 | Source Verified | 2018-01-01 | car | CA | 1.35 | 0.0 | Jul-2007 | 660.0 | 1.0 | 31.0 | -1.0 | 11.0 | 0.0 | 1461.0 | 4.1 | 21.0 | 14662.947011 | 0.0 | 674.0 | 31.0 | Individual | -1.0 | -1.00 | 0.0 | 1461.0 | 1.0 | 0.0 | 0.0 | 1.0 | -1.0 | 1.0 | 2.0 | 35300.0 | 0.0 | 2.0 | 3.0 | 162.0 | 0.0 | 0.0 | 31.0 | 0.0 | 5.0 | 19.0 | -1.0 | 0 | 1 |
107864 rows × 51 columns
# Engineered features on the segmentation frame: integer-rounded copies of
# the monetary/utilization features plus the age of the oldest credit line.
# NOTE(review): the original recomputed 'dti_rounded' here although the
# identical value was already assigned right after the fillna step; the
# redundant statement was removed.
for_segmentation['instlmnt_round'] = np.round(for_segmentation['installment']).astype(int)
for_segmentation['Annual_Inc_round'] = np.round(for_segmentation['annual_inc']).astype(int)
# NOTE(review): rounding this ratio to 0 decimals collapses it to 0 for
# essentially every borrower (installment << annual income; the sample output
# shows 0.0 on every displayed row). Consider np.round(..., 4) or no rounding
# at all — left unchanged pending confirmation of intent.
for_segmentation['instlmnt_to_Annual_Inc'] = np.round(for_segmentation['instlmnt_round']/for_segmentation['Annual_Inc_round'])
for_segmentation['revol_bal_round'] = np.round(for_segmentation['revol_bal']).astype(int)
for_segmentation['revol_util_round'] = np.round(for_segmentation['revol_util']).astype(int)
for_segmentation['total_pymnt_round'] = np.round(for_segmentation['total_pymnt']).astype(int)
for_segmentation['recoveries_round'] = np.round(for_segmentation['recoveries']).astype(int)
for_segmentation['avg_cur_bal_round'] = np.round(for_segmentation['avg_cur_bal']).astype(int)
# Parse 'Mon-YYYY' strings (e.g. 'Jan-2009') into datetimes.
for_segmentation['earliest_cr_line'] = pd.to_datetime(for_segmentation['earliest_cr_line'], format='%b-%Y')
# Age of the credit line relative to 2018 (all applications were received in 2018).
for_segmentation['years_with_Credit_line'] = 2018 - for_segmentation['earliest_cr_line'].dt.year
for_segmentation['years_with_Credit_line']
0 9
1 20
2 11
3 23
4 20
..
107859 36
107860 24
107861 21
107862 20
107863 11
Name: years_with_Credit_line, Length: 107864, dtype: int32
def analyze_column_and_export(for_segmentation, column_name, output_file):
    """Summarise the good/bad split of one predictor and export it to Excel.

    For every distinct value of ``column_name`` the summary holds the percent
    of total frequency, frequency counts and column percentages for goods
    (Defaulted == 0) and bads (Defaulted == 1), the good-bad odds, the bad
    rate (%), the weight of evidence (WOE) and each row's information value
    (IV) contribution.

    Parameters
    ----------
    for_segmentation : pd.DataFrame
        Must contain ``column_name`` and a binary 'Defaulted' column.
    column_name : str
        Predictor to analyse.
    output_file : str or None
        Path of the Excel workbook to write; pass a falsy value to skip the
        export (Excel export requires an engine such as openpyxl).

    Returns
    -------
    pd.DataFrame
        The summary table, sorted ascending by ``column_name``.

    Raises
    ------
    ValueError
        If the frame has no 'Defaulted' column.
    """
    if 'Defaulted' not in for_segmentation.columns:
        raise ValueError("The dataset must contain a 'Defaulted' column.")
    # Percent of total frequency for each distinct value.
    total_percent = for_segmentation[column_name].value_counts(normalize=True) * 100
    goods = for_segmentation.loc[for_segmentation['Defaulted'] == 0, column_name]
    bads = for_segmentation.loc[for_segmentation['Defaulted'] == 1, column_name]
    # FIX(review): reindex every statistic on the full category set so all
    # columns are row-aligned. The previous version mixed positional arrays
    # (total_count.index/.values, sorted by overall frequency) with
    # index-aligned Series in the DataFrame constructor, which could pair a
    # category label with another category's goods/bads counts.
    categories = total_percent.index
    goods_count = goods.value_counts().reindex(categories, fill_value=0)
    goods_percent = (goods.value_counts(normalize=True) * 100).reindex(categories, fill_value=0)
    bads_count = bads.value_counts().reindex(categories, fill_value=0)
    bads_percent = (bads.value_counts(normalize=True) * 100).reindex(categories, fill_value=0)
    summary_df = pd.DataFrame({
        column_name: categories,
        'Percent of Total Frequency': total_percent.values,
        'Frequency Count (Goods)': goods_count.values,
        'Percent of Column Frequency (Goods)': goods_percent.values,
        'Frequency Count (Bads)': bads_count.values,
        'Percent of Column Frequency (Bads)': bads_percent.values,
    })
    # Good-bad odds; a zero bad count would give inf, which is mapped to 0.
    odds = summary_df['Frequency Count (Goods)'] / summary_df['Frequency Count (Bads)']
    summary_df['Good-Bad Odds'] = odds.replace([np.inf, -np.inf], 0)
    # Bad rate as a percentage of each value's total volume.
    volume = summary_df['Frequency Count (Goods)'] + summary_df['Frequency Count (Bads)']
    summary_df['Bad Rate'] = summary_df['Frequency Count (Bads)'] / volume * 100
    # Weight of evidence: ln(goods% / bads%); cells where the ratio is
    # undefined (zero bads) or degenerate (zero goods -> -inf) become 0.
    # (Chained inplace=True replace/fillna on column slices was removed —
    # it is deprecated and unreliable under pandas copy-on-write.)
    woe = np.log(np.where(summary_df['Percent of Column Frequency (Bads)'] == 0,
                          np.nan,
                          summary_df['Percent of Column Frequency (Goods)']
                          / summary_df['Percent of Column Frequency (Bads)']))
    summary_df['WOE'] = (pd.Series(woe, index=summary_df.index)
                         .replace([np.inf, -np.inf], 0)
                         .fillna(0))
    # Information value contribution of each row.
    summary_df['IV'] = ((summary_df['Percent of Column Frequency (Goods)']
                         - summary_df['Percent of Column Frequency (Bads)'])
                        * summary_df['WOE']).fillna(0)
    # Duplicate the grouping characteristic under a fixed column name.
    summary_df['grpchar'] = summary_df[column_name]
    # Present the table sorted by the characteristic's values.
    summary_df = summary_df.sort_values(by=column_name).reset_index(drop=True)
    if output_file:
        summary_df.to_excel(output_file, index=False)
    return summary_df
# Run the univariate good/bad analysis for each candidate predictor to obtain
# the "For_IV_Segm" Excel file. Every call writes to the same workbook, so
# each export overwrites the previous one — mirroring the original
# run-one-cell-at-a-time workflow, now expressed as a single loop instead of
# 33 copy-pasted call pairs.
output_file = 'D:/Data Analytics Tools/For_IV_Segm.xlsx'
predictors_to_analyze = [
    'mths_since_recent_inq',
    'total_acc',
    'il_util',
    'num_rev_accts',
    'mort_acc',
    'application_type',
    'open_rv_12m',
    'acc_open_past_24mths',
    'open_rv_24m',
    'open_acc_6m',
    'inq_last_12m',
    'recoveries_round',
    'open_il_12m',
    'last_fico_range_high',
    'mths_since_last_major_derog',
    'fico_range_low',
    'revol_util',
    'term',
    'emp_length',
    'dti_rounded',
    'instlmnt_to_Annual_Inc',
    'home_ownership',
    'verification_status',
    'purpose',
    'years_with_Credit_line',
    'inq_last_6mths',
    'mths_since_last_delinq',
    'mths_since_last_record',
    'open_acc',
    'pub_rec',
    'revol_bal',
    'term',         # duplicate kept to preserve the original call order
    'dti_rounded',  # duplicate kept to preserve the original call order
]
for predictor in predictors_to_analyze:
    analyze_column_and_export(for_segmentation, predictor, output_file)
#####################################################################################
#####################################################################################
# TRAIN SET
#####################################################################################
#####################################################################################
# Impute missing values by feature type (sentinel label / -1), then derive the
# engineered features used by the segmentation step.
for column_group, fill_value in ((categorical, 'MISSING'),
                                 (discrete, -1),
                                 (continuous, -1)):
    X_train[column_group] = X_train[column_group].fillna(fill_value)
# Integer-rounded copies of the raw amounts (insertion order matches the
# original cell so the frame's column layout is unchanged).
for raw_col, rounded_col in (('dti', 'dti_rounded'),
                             ('installment', 'instlmnt_round'),
                             ('annual_inc', 'Annual_Inc_round')):
    X_train[rounded_col] = np.round(X_train[raw_col]).astype(int)
# Installment burden relative to annual income.
X_train['instlmnt_to_Annual_Inc'] = np.round(X_train['instlmnt_round'] / X_train['Annual_Inc_round'])
for raw_col, rounded_col in (('revol_bal', 'revol_bal_round'),
                             ('revol_util', 'revol_util_round'),
                             ('total_pymnt', 'total_pymnt_round'),
                             ('recoveries', 'recoveries_round'),
                             ('avg_cur_bal', 'avg_cur_bal_round')):
    X_train[rounded_col] = np.round(X_train[raw_col]).astype(int)
# Parse 'Mon-YYYY' strings into datetimes, then compute the credit-line age
# relative to 2018 (all applications were received during 2018).
X_train['earliest_cr_line'] = pd.to_datetime(X_train['earliest_cr_line'], format='%b-%Y')
X_train['years_with_Credit_line'] = 2018 - X_train['earliest_cr_line'].dt.year
X_train['years_with_Credit_line']
#####################################################################################
#####################################################################################
# TEST SET
#####################################################################################
#####################################################################################
# Impute missing values by feature type (same sentinels as the train set).
X_test[categorical] = X_test[categorical].fillna('MISSING')
X_test[discrete] = X_test[discrete].fillna(-1)
X_test[continuous] = X_test[continuous].fillna(-1)
# Integer-rounded feature copies. FIX(review): the original assigned
# 'dti_rounded' twice in a row with identical results; one statement removed.
X_test['dti_rounded'] = np.round(X_test['dti']).astype(int)
X_test['instlmnt_round'] = np.round(X_test['installment']).astype(int)
X_test['Annual_Inc_round'] = np.round(X_test['annual_inc']).astype(int)
X_test['instlmnt_to_Annual_Inc'] = np.round(X_test['instlmnt_round']/X_test['Annual_Inc_round'])
X_test['revol_bal_round'] = np.round(X_test['revol_bal']).astype(int)
X_test['revol_util_round'] = np.round(X_test['revol_util']).astype(int)
X_test['total_pymnt_round'] = np.round(X_test['total_pymnt']).astype(int)
X_test['recoveries_round'] = np.round(X_test['recoveries']).astype(int)
X_test['avg_cur_bal_round'] = np.round(X_test['avg_cur_bal']).astype(int)
# Parse 'Mon-YYYY' strings into datetimes.
X_test['earliest_cr_line'] = pd.to_datetime(X_test['earliest_cr_line'], format='%b-%Y')
# Age of the oldest credit line relative to 2018 (application year).
X_test['years_with_Credit_line'] = 2018 - X_test['earliest_cr_line'].dt.year
# Function to segment variables using if-else
def segment_variables(df):
    """Append integer ``*_segm`` coarse-class columns to ``df`` in place.

    Each raw predictor is mapped to a small number of ordinal risk groups;
    the value -1 marks the "missing" group (missing numerics were imputed
    earlier as -1 and missing categoricals as 'MISSING'). The mutated frame
    is also returned for convenience.
    """
    # mths_since_recent_inq: -1 missing, 1 recent (0-2), 2 mid (3-10), 3 older
    df['mths_since_recent_inq_segm'] = df['mths_since_recent_inq'].apply(
        lambda x: -1 if x == -1 else
        1 if 0 <= x <= 2 else
        2 if 3 <= x <= 10 else
        3
    ).astype(int)
    # delinq_2yrs: more delinquencies -> lower group number
    df['delinq_2yrs_segm'] = df['delinq_2yrs'].apply(
        lambda x: 1 if x >= 3 else
        2 if x == 2 else
        3 if x == 1 else
        4
    ).astype(int)
    # fico_range_low: FICO comes in steps of 5, so the gaps between band
    # edges (e.g. 681-684) are never hit in practice
    df['fico_range_low_segm'] = df['fico_range_low'].apply(
        lambda x: 1 if x <= 680 else
        2 if 685 <= x <= 700 else
        3 if 705 <= x <= 720 else
        4 if 725 <= x <= 790 else
        5
    ).astype(int)
    # term: note the leading space in the raw values. (The original chained
    # .fillna(-1) was a no-op — the lambda always returns an int.)
    df['term_segm'] = df['term'].apply(
        lambda x: 1 if x == ' 60 months' else
        2
    ).astype(int)
    # emp_length: longest tenures (9 / 10+ years) -> 1, rest -> 2, missing -> -1
    emp_length_map = {
        'MISSING': -1,
        '10+ years': 1,
        '9 years': 1,
        '1 year': 2,
        '2 years': 2,
        '3 years': 2,
        '4 years': 2,
        '5 years': 2,
        '6 years': 2,
        '7 years': 2,
        '8 years': 2,
        '< 1 year': 2
    }
    df['emp_length_segm'] = df['emp_length'].map(emp_length_map).fillna(-1).astype(int)
    # dti_rounded: low debt-to-income (0-7) vs the rest
    df['dti_rounded_segm'] = df['dti_rounded'].apply(
        lambda x: -1 if x == -1 else
        1 if 0 <= x <= 7 else
        2
    ).astype(int)
    # Annual_Inc_round: income bands
    df['Annual_Inc_round_segm'] = df['Annual_Inc_round'].apply(
        lambda x: 1 if x <= 84999 else
        2 if 85000 <= x <= 109999 else
        3 if 110000 <= x <= 214999 else
        4
    ).astype(int)
    # instlmnt_round: installment-size bands
    df['instlmnt_round_segm'] = df['instlmnt_round'].apply(
        lambda x: 1 if x <= 479 else
        2 if 480 <= x <= 699 else
        3 if 700 <= x <= 879 else
        4
    ).astype(int)
    # home_ownership
    home_ownership_map = {
        'ANY': 1,
        'OWN': 1,
        'MORTGAGE': 2,
        'RENT': 2
    }
    df['home_ownership_segm'] = df['home_ownership'].map(home_ownership_map).fillna(-1).astype(int)
    # verification_status
    status_verified_map = {
        'Not Verified': 3,
        'Source Verified': 2,
        'Verified': 1
    }
    df['verification_status_segm'] = df['verification_status'].map(status_verified_map).fillna(-1).astype(int)
    # purpose
    purpose_map = {
        'car': 1,
        'house': 1,
        'major_purchase': 1,
        'medical': 1,
        'moving': 1,
        'small_business': 1,
        'debt_consolidation': 2,
        'home_improvement': 2,
        'other': 2,
        'renewable_energy': 2,
        'vacation': 2,
        'wedding': 2,
        'credit_card': 3
    }
    df['purpose_segm'] = df['purpose'].map(purpose_map).fillna(-1).astype(int)
    # years_with_Credit_line: 10-18 years and 40+ years share group 1
    df['years_with_Credit_line_segm'] = df['years_with_Credit_line'].apply(
        lambda x: 1 if 10 <= x <= 18 else
        1 if x >= 40 else
        2
    ).astype(int)
    # inq_last_6mths: more inquiries -> lower group number
    df['inq_last_6mths_segm'] = df['inq_last_6mths'].apply(
        lambda x: 1 if x >= 3 else
        2 if x == 2 else
        3 if x == 1 else
        4
    ).astype(int)
    # mths_since_last_delinq
    df['mths_since_last_delinq_segm'] = df['mths_since_last_delinq'].apply(
        lambda x: -1 if x == -1 else
        1 if 0 <= x <= 45 else
        2
    ).astype(int)
    # mths_since_last_record (0 falls into group 2, matching the original bounds)
    df['mths_since_last_record_segm'] = df['mths_since_last_record'].apply(
        lambda x: -1 if x == -1 else
        1 if 1 <= x <= 68 else
        2
    ).astype(int)
    # revol_bal: revolving-balance bands
    df['revol_bal_segm'] = df['revol_bal'].apply(
        lambda x: 1 if x <= 9999 else
        2 if 10000 <= x <= 19999 else
        3 if 20000 <= x <= 39999 else
        4
    ).astype(int)
    # total_pymnt_round: total-payment bands
    df['total_pymnt_round_segm'] = df['total_pymnt_round'].apply(
        lambda x: 1 if x <= 4999 else
        2 if 5000 <= x <= 9999 else
        3 if 10000 <= x <= 14999 else
        4
    ).astype(int)
    # mths_since_last_major_derog
    df['mths_since_last_major_derog_segm'] = df['mths_since_last_major_derog'].apply(
        lambda x: -1 if x == -1 else
        1 if 0 <= x <= 45 else
        2 if 46 <= x <= 78 else
        3
    ).astype(int)
    # application_type
    # FIX(review): the original map used the key 'Joint_App', but the raw
    # data holds 'Joint App' (with a space — see the displayed frame), so
    # every joint application was mis-assigned to the -1 "missing" group.
    application_type_map = {
        'Joint App': 1,
        'Individual': 2
    }
    df['application_type_segm'] = df['application_type'].map(application_type_map).fillna(-1).astype(int)
    # open_acc_6m
    # NOTE(review): missing values were imputed as -1 earlier, so -1 <= 1 puts
    # them in group 1; the trailing -1 branch only catches NaN. Confirm that
    # lumping missing with "0-1 accounts" is intended.
    df['open_acc_6m_segm'] = df['open_acc_6m'].apply(
        lambda x: 1 if x <= 1 else
        2 if x > 1 else
        -1
    ).astype(int)
    # open_il_12m (same missing-value caveat as open_acc_6m)
    df['open_il_12m_segm'] = df['open_il_12m'].apply(
        lambda x: 1 if x <= 1 else
        2 if x > 1 else
        -1
    ).astype(int)
    # open_rv_12m (same missing-value caveat as open_acc_6m)
    df['open_rv_12m_segm'] = df['open_rv_12m'].apply(
        lambda x: 1 if x <= 1 else
        2 if x > 1 else
        -1
    ).astype(int)
    # open_rv_24m
    df['open_rv_24m_segm'] = df['open_rv_24m'].apply(
        lambda x: 1 if x <= 1 else
        2 if 2 <= x <= 6 else
        3 if x >= 7 else
        -1
    ).astype(int)
    # inq_last_12m
    df['inq_last_12m_segm'] = df['inq_last_12m'].apply(
        lambda x: 1 if x <= 1 else
        2 if 2 <= x <= 6 else
        3 if x >= 7 else
        -1
    ).astype(int)
    # acc_open_past_24mths (the redundant second .astype(int) was removed)
    df['acc_open_past_24mths_segm'] = df['acc_open_past_24mths'].apply(
        lambda x: 1 if x <= 1 else
        2 if 2 <= x <= 6 else
        3 if x >= 7 else
        -1
    ).astype(int)
    # mort_acc
    # NOTE(review): the imputed -1 sentinel falls through to group 4 here,
    # alongside "3+ mortgage accounts" — confirm this is intended.
    df['mort_acc_segm'] = df['mort_acc'].apply(
        lambda x: 1 if x == 0 else
        2 if x == 1 else
        3 if x == 2 else
        4
    ).astype(int)
    return df
# Apply the segmentation rules to both data splits.
# NOTE: segment_variables mutates its argument in place, so these names
# alias X_train / X_test rather than independent copies.
X_train_segmented, X_test_segmented = (segment_variables(frame)
                                       for frame in (X_train, X_test))
X_train_segmented
| term | installment | emp_length | home_ownership | annual_inc | verification_status | issue_d | purpose | addr_state | dti | delinq_2yrs | earliest_cr_line | fico_range_low | inq_last_6mths | mths_since_last_delinq | mths_since_last_record | open_acc | pub_rec | revol_bal | revol_util | total_acc | total_pymnt | recoveries | last_fico_range_high | mths_since_last_major_derog | application_type | annual_inc_joint | dti_joint | tot_coll_amt | tot_cur_bal | open_acc_6m | open_act_il | open_il_12m | open_il_24m | il_util | open_rv_12m | open_rv_24m | total_rev_hi_lim | total_cu_tl | inq_last_12m | acc_open_past_24mths | avg_cur_bal | delinq_amnt | mort_acc | mths_since_recent_bc_dlq | mths_since_recent_inq | num_accts_ever_120_pd | num_rev_accts | revol_bal_joint | dti_rounded | instlmnt_round | Annual_Inc_round | instlmnt_to_Annual_Inc | revol_bal_round | revol_util_round | total_pymnt_round | recoveries_round | avg_cur_bal_round | years_with_Credit_line | mths_since_recent_inq_segm | delinq_2yrs_segm | fico_range_low_segm | term_segm | emp_length_segm | dti_rounded_segm | Annual_Inc_round_segm | instlmnt_round_segm | home_ownership_segm | verification_status_segm | purpose_segm | years_with_Credit_line_segm | inq_last_6mths_segm | mths_since_last_delinq_segm | mths_since_last_record_segm | revol_bal_segm | total_pymnt_round_segm | mths_since_last_major_derog_segm | application_type_segm | open_acc_6m_segm | open_il_12m_segm | open_rv_12m_segm | open_rv_24m_segm | inq_last_12m_segm | acc_open_past_24mths_segm | mort_acc_segm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 36 months | 186.82 | 8 years | RENT | 50000.00 | Verified | 2018-03-01 | other | OK | 21.80 | 1.0 | 2009-01-01 | 665.0 | 0.0 | 9.0 | -1.0 | 5.0 | 0.0 | 116.0 | 23.2 | 18.0 | 2043.690000 | 0.0 | 609.0 | 9.0 | Individual | -1.0 | -1.00 | 0.0 | 19344.0 | 0.0 | 2.0 | 0.0 | 1.0 | 51.0 | 1.0 | 2.0 | 500.0 | 0.0 | 5.0 | 3.0 | 3869.0 | 0.0 | 0.0 | -1.0 | 2.0 | 4.0 | 2.0 | -1.0 | 22 | 187 | 50000 | 0.0 | 116 | 23 | 2044 | 0 | 3869 | 9 | 1 | 3 | 1 | 2 | 2 | 2 | 1 | 1 | 2 | 1 | 2 | 2 | 4 | 1 | -1 | 1 | 1 | 1 | 2 | 1 | 1 | 1 | 2 | 2 | 2 | 1 |
| 1 | 36 months | 483.45 | 2 years | OWN | 196000.00 | Source Verified | 2018-03-01 | debt_consolidation | FL | 18.29 | 0.0 | 1998-07-01 | 700.0 | 0.0 | 65.0 | -1.0 | 19.0 | 0.0 | 24243.0 | 46.3 | 53.0 | 5301.420000 | 0.0 | 694.0 | -1.0 | Individual | -1.0 | -1.00 | 0.0 | 534954.0 | 4.0 | 3.0 | 2.0 | 2.0 | 59.0 | 4.0 | 12.0 | 52400.0 | 1.0 | 7.0 | 15.0 | 31468.0 | 0.0 | 5.0 | -1.0 | 6.0 | 0.0 | 37.0 | -1.0 | 18 | 483 | 196000 | 0.0 | 24243 | 46 | 5301 | 0 | 31468 | 20 | 2 | 4 | 2 | 2 | 2 | 2 | 3 | 2 | 1 | 2 | 2 | 2 | 4 | 2 | -1 | 3 | 2 | -1 | 2 | 2 | 2 | 2 | 3 | 3 | 3 | 4 |
| 2 | 60 months | 367.82 | < 1 year | RENT | 44000.00 | Not Verified | 2018-03-01 | medical | NH | 43.97 | 1.0 | 2007-07-01 | 665.0 | 2.0 | 6.0 | -1.0 | 8.0 | 0.0 | 1526.0 | 24.6 | 14.0 | 4007.700000 | 0.0 | 629.0 | 70.0 | Joint App | 81000.0 | 31.94 | 0.0 | 67173.0 | 1.0 | 4.0 | 1.0 | 4.0 | 89.0 | 1.0 | 1.0 | 6200.0 | 1.0 | 10.0 | 5.0 | 8397.0 | 0.0 | 0.0 | 35.0 | 0.0 | 1.0 | 6.0 | 7101.0 | 44 | 368 | 44000 | 0.0 | 1526 | 25 | 4008 | 0 | 8397 | 11 | 1 | 3 | 1 | 1 | 2 | 2 | 1 | 1 | 2 | 3 | 1 | 1 | 2 | 1 | -1 | 1 | 1 | 2 | -1 | 1 | 1 | 1 | 1 | 3 | 2 | 1 |
| 3 | 60 months | 688.35 | 10+ years | MORTGAGE | 65000.00 | Source Verified | 2018-03-01 | debt_consolidation | AL | 12.89 | 1.0 | 1995-03-01 | 665.0 | 1.0 | 22.0 | -1.0 | 7.0 | 0.0 | 8657.0 | 98.4 | 16.0 | 7511.160000 | 0.0 | 669.0 | 23.0 | Individual | -1.0 | -1.00 | 0.0 | 74795.0 | 0.0 | 2.0 | 0.0 | 2.0 | 82.0 | 0.0 | 0.0 | 8800.0 | 3.0 | 3.0 | 2.0 | 10685.0 | 0.0 | 2.0 | -1.0 | 0.0 | 2.0 | 9.0 | -1.0 | 13 | 688 | 65000 | 0.0 | 8657 | 98 | 7511 | 0 | 10685 | 23 | 1 | 3 | 1 | 1 | 1 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 3 | 1 | -1 | 1 | 2 | 1 | 2 | 1 | 1 | 1 | 1 | 2 | 2 | 3 |
| 4 | 36 months | 93.10 | 9 years | RENT | 52000.00 | Source Verified | 2018-03-01 | major_purchase | WA | 0.58 | 0.0 | 1998-01-01 | 760.0 | 0.0 | 26.0 | -1.0 | 7.0 | 0.0 | 141.0 | 0.5 | 30.0 | 3011.577285 | 0.0 | 764.0 | -1.0 | Individual | -1.0 | -1.00 | 0.0 | 150592.0 | 0.0 | 0.0 | 1.0 | 2.0 | -1.0 | 0.0 | 1.0 | 31000.0 | 2.0 | 2.0 | 3.0 | 25099.0 | 0.0 | 4.0 | -1.0 | 7.0 | 0.0 | 19.0 | -1.0 | 1 | 93 | 52000 | 0.0 | 141 | 0 | 3012 | 0 | 25099 | 20 | 2 | 4 | 4 | 2 | 1 | 1 | 1 | 1 | 2 | 2 | 1 | 2 | 4 | 1 | -1 | 1 | 1 | -1 | 2 | 1 | 1 | 1 | 1 | 2 | 2 | 4 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 107859 | 60 months | 270.71 | MISSING | MORTGAGE | 89625.39 | Not Verified | 2018-01-01 | debt_consolidation | CA | 17.61 | 0.0 | 1982-01-01 | 660.0 | 0.0 | 40.0 | -1.0 | 10.0 | 0.0 | 18601.0 | 90.3 | 37.0 | 12483.154233 | 0.0 | 689.0 | 40.0 | Individual | -1.0 | -1.00 | 0.0 | 473894.0 | 1.0 | 2.0 | 2.0 | 2.0 | 88.0 | 1.0 | 1.0 | 20600.0 | 0.0 | 0.0 | 3.0 | 47389.0 | 0.0 | 1.0 | 40.0 | -1.0 | 16.0 | 27.0 | -1.0 | 18 | 271 | 89625 | 0.0 | 18601 | 90 | 12483 | 0 | 47389 | 36 | -1 | 4 | 1 | 1 | -1 | 2 | 2 | 1 | 2 | 3 | 2 | 2 | 4 | 1 | -1 | 2 | 3 | 1 | 2 | 1 | 2 | 1 | 1 | 1 | 2 | 2 |
| 107860 | 36 months | 149.70 | 10+ years | MORTGAGE | 52000.00 | Not Verified | 2018-01-01 | home_improvement | IL | 33.72 | 0.0 | 1994-02-01 | 690.0 | 0.0 | -1.0 | -1.0 | 22.0 | 0.0 | 28116.0 | 49.2 | 41.0 | 2092.380000 | 0.0 | 684.0 | -1.0 | Individual | -1.0 | -1.00 | 249.0 | 217780.0 | 1.0 | 2.0 | 0.0 | 2.0 | 66.0 | 1.0 | 3.0 | 57200.0 | 0.0 | 0.0 | 5.0 | 9899.0 | 0.0 | 3.0 | -1.0 | 17.0 | 0.0 | 34.0 | -1.0 | 34 | 150 | 52000 | 0.0 | 28116 | 49 | 2092 | 0 | 9899 | 24 | 3 | 4 | 2 | 2 | 1 | 2 | 1 | 1 | 2 | 3 | 2 | 2 | 4 | -1 | -1 | 3 | 1 | -1 | 2 | 1 | 1 | 1 | 2 | 1 | 2 | 4 |
| 107861 | 36 months | 196.18 | 10+ years | MORTGAGE | 50000.00 | Source Verified | 2018-01-01 | debt_consolidation | NH | 28.93 | 0.0 | 1997-06-01 | 690.0 | 0.0 | 58.0 | -1.0 | 11.0 | 0.0 | 6950.0 | 51.9 | 14.0 | 2742.880000 | 0.0 | 819.0 | 58.0 | Individual | -1.0 | -1.00 | 0.0 | 230614.0 | 0.0 | 1.0 | 1.0 | 1.0 | 83.0 | 1.0 | 1.0 | 13400.0 | 0.0 | 2.0 | 2.0 | 20965.0 | 0.0 | 2.0 | 58.0 | 7.0 | 1.0 | 11.0 | -1.0 | 29 | 196 | 50000 | 0.0 | 6950 | 52 | 2743 | 0 | 20965 | 21 | 2 | 4 | 2 | 2 | 1 | 2 | 1 | 1 | 2 | 2 | 2 | 2 | 4 | 2 | -1 | 1 | 1 | 2 | 2 | 1 | 1 | 1 | 1 | 2 | 2 | 3 |
| 107862 | 36 months | 389.58 | 8 years | MORTGAGE | 36000.00 | Verified | 2018-01-01 | debt_consolidation | IN | 11.10 | 1.0 | 1998-05-01 | 685.0 | 0.0 | 21.0 | -1.0 | 14.0 | 0.0 | 11648.0 | 43.6 | 18.0 | 5593.050000 | 0.0 | 694.0 | 21.0 | Individual | -1.0 | -1.00 | 0.0 | 191131.0 | 2.0 | 1.0 | 0.0 | 0.0 | -1.0 | 3.0 | 6.0 | 26700.0 | 0.0 | 1.0 | 6.0 | 14702.0 | 0.0 | 1.0 | -1.0 | 11.0 | 1.0 | 12.0 | -1.0 | 11 | 390 | 36000 | 0.0 | 11648 | 44 | 5593 | 0 | 14702 | 20 | 3 | 3 | 2 | 2 | 2 | 2 | 1 | 1 | 2 | 1 | 2 | 2 | 4 | 1 | -1 | 2 | 2 | 1 | 2 | 2 | 1 | 2 | 2 | 1 | 2 | 2 |
| 107863 | 36 months | 475.71 | 2 years | OWN | 80000.00 | Source Verified | 2018-01-01 | car | CA | 1.35 | 0.0 | 2007-07-01 | 660.0 | 1.0 | 31.0 | -1.0 | 11.0 | 0.0 | 1461.0 | 4.1 | 21.0 | 14662.947011 | 0.0 | 674.0 | 31.0 | Individual | -1.0 | -1.00 | 0.0 | 1461.0 | 1.0 | 0.0 | 0.0 | 1.0 | -1.0 | 1.0 | 2.0 | 35300.0 | 0.0 | 2.0 | 3.0 | 162.0 | 0.0 | 0.0 | 31.0 | 0.0 | 5.0 | 19.0 | -1.0 | 1 | 476 | 80000 | 0.0 | 1461 | 4 | 14663 | 0 | 162 | 11 | 1 | 4 | 1 | 2 | 2 | 1 | 1 | 1 | 1 | 2 | 1 | 1 | 3 | 1 | -1 | 1 | 3 | 1 | 2 | 1 | 1 | 1 | 2 | 2 | 2 | 1 |
107864 rows × 85 columns
# Sanity check: both splits carry the same engineered feature set (85 columns).
for frame in (X_train_segmented, X_test_segmented):
    print(frame.shape)
(107864, 85) (42928, 85)
# Crosstab with absolute volumes: each segmented variable against its raw
# source column, to verify that every raw level lands in exactly one segment.
def _segm_crosstab(column):
    """Cross-tabulate raw `column` against its `<column>_segm` counterpart."""
    return pd.crosstab(X_train_segmented[column + '_segm'],
                       X_train_segmented[column])

crosstab_absolute1 = _segm_crosstab('purpose')
crosstab_absolute2 = _segm_crosstab('term')
crosstab_absolute3 = _segm_crosstab('years_with_Credit_line')
crosstab_absolute4 = _segm_crosstab('home_ownership')
crosstab_absolute5 = _segm_crosstab('application_type')
crosstab_absolute6 = _segm_crosstab('delinq_2yrs')
crosstab_absolute7 = _segm_crosstab('mths_since_last_delinq')
crosstab_absolute8 = _segm_crosstab('mths_since_last_record')
crosstab_absolute9 = _segm_crosstab('mths_since_last_major_derog')
crosstab_absolute10 = _segm_crosstab('open_rv_24m')
crosstab_absolute11 = _segm_crosstab('open_rv_12m')
crosstab_absolute12 = _segm_crosstab('open_acc_6m')
crosstab_absolute13 = _segm_crosstab('mort_acc')
crosstab_absolute14 = _segm_crosstab('emp_length')
crosstab_absolute15 = _segm_crosstab('open_il_12m')
crosstab_absolute16 = _segm_crosstab('mths_since_recent_inq')
crosstab_absolute17 = _segm_crosstab('inq_last_6mths')
crosstab_absolute18 = _segm_crosstab('inq_last_12m')
crosstab_absolute19 = _segm_crosstab('fico_range_low')
crosstab_absolute20 = _segm_crosstab('acc_open_past_24mths')
crosstab_absolute21 = _segm_crosstab('verification_status')
# Continuous variables
crosstab_absolute22 = _segm_crosstab('revol_bal')
crosstab_absolute23 = _segm_crosstab('Annual_Inc_round')
# crosstab_absolute24 = _segm_crosstab('out_prncp_round')  # excluded variable
crosstab_absolute25 = _segm_crosstab('total_pymnt_round')
crosstab_absolute26 = _segm_crosstab('instlmnt_round')
crosstab_absolute27 = _segm_crosstab('dti_rounded')
# Display the categorical-variable crosstabs (part 1 of 2).
for table in (crosstab_absolute1, crosstab_absolute2, crosstab_absolute3,
              crosstab_absolute4, crosstab_absolute5, crosstab_absolute6,
              crosstab_absolute7):
    print("Crosstab with Absolute Volumes:\n", table)
Crosstab with Absolute Volumes: purpose car credit_card debt_consolidation home_improvement house \ purpose_segm 1 1361 0 0 0 1579 2 0 0 55083 7640 0 3 0 24577 0 0 0 purpose major_purchase medical moving other renewable_energy \ purpose_segm 1 3217 1765 745 0 0 2 0 0 0 9734 63 3 0 0 0 0 0 purpose small_business vacation wedding purpose_segm 1 1346 0 0 2 0 749 5 3 0 0 0 Crosstab with Absolute Volumes: term 36 months 60 months term_segm 1 0 32452 2 75412 0 Crosstab with Absolute Volumes: years_with_Credit_line 3 4 5 6 7 8 9 10 \ years_with_Credit_line_segm 1 0 0 0 0 0 0 0 3909 2 143 2262 2403 2530 2869 2644 2551 0 years_with_Credit_line 11 12 13 14 15 16 17 18 \ years_with_Credit_line_segm 1 6547 7936 7936 7453 6729 5653 5297 4902 2 0 0 0 0 0 0 0 0 years_with_Credit_line 19 20 21 22 23 24 25 26 \ years_with_Credit_line_segm 1 0 0 0 0 0 0 0 0 2 4267 3673 3209 3088 2914 2538 2115 1499 years_with_Credit_line 27 28 29 30 31 32 33 34 35 \ years_with_Credit_line_segm 1 0 0 0 0 0 0 0 0 0 2 1411 1465 1413 1146 1132 907 839 780 631 years_with_Credit_line 36 37 38 39 40 41 42 43 44 45 \ years_with_Credit_line_segm 1 0 0 0 0 247 261 189 143 148 126 2 493 354 266 335 0 0 0 0 0 0 years_with_Credit_line 46 47 48 49 50 51 52 53 54 55 56 57 \ years_with_Credit_line_segm 1 93 72 63 64 54 48 26 28 25 13 6 5 2 0 0 0 0 0 0 0 0 0 0 0 0 years_with_Credit_line 58 59 60 62 65 67 68 years_with_Credit_line_segm 1 4 4 1 1 1 2 1 2 0 0 0 0 0 0 0 Crosstab with Absolute Volumes: home_ownership ANY MORTGAGE OWN RENT home_ownership_segm 1 4 0 14010 0 2 0 51874 0 41976 Crosstab with Absolute Volumes: application_type Individual Joint App application_type_segm -1 0 16331 2 91533 0 Crosstab with Absolute Volumes: delinq_2yrs 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 \ delinq_2yrs_segm 1 0 0 0 902 419 193 108 54 41 2 0 0 2727 0 0 0 0 0 0 3 0 10916 0 0 0 0 0 0 0 4 92407 0 0 0 0 0 0 0 0 delinq_2yrs 9.0 10.0 11.0 12.0 13.0 14.0 15.0 16.0 17.0 19.0 \ delinq_2yrs_segm 1 26 32 15 9 3 3 2 1 2 2 2 0 0 0 0 0 0 0 0 0 0 
3 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0 delinq_2yrs 20.0 delinq_2yrs_segm 1 2 2 0 3 0 4 0 Crosstab with Absolute Volumes: mths_since_last_delinq -1.0 0.0 1.0 2.0 3.0 4.0 \ mths_since_last_delinq_segm -1 60695 0 0 0 0 0 1 0 9 91 237 449 553 2 0 0 0 0 0 0 mths_since_last_delinq 5.0 6.0 7.0 8.0 9.0 10.0 \ mths_since_last_delinq_segm -1 0 0 0 0 0 0 1 564 718 700 628 639 629 2 0 0 0 0 0 0 mths_since_last_delinq 11.0 12.0 13.0 14.0 15.0 16.0 \ mths_since_last_delinq_segm -1 0 0 0 0 0 0 1 722 910 912 810 751 755 2 0 0 0 0 0 0 mths_since_last_delinq 17.0 18.0 19.0 20.0 21.0 22.0 \ mths_since_last_delinq_segm -1 0 0 0 0 0 0 1 686 767 790 667 686 721 2 0 0 0 0 0 0 mths_since_last_delinq 23.0 24.0 25.0 26.0 27.0 28.0 \ mths_since_last_delinq_segm -1 0 0 0 0 0 0 1 793 884 912 857 765 771 2 0 0 0 0 0 0 mths_since_last_delinq 29.0 30.0 31.0 32.0 33.0 34.0 \ mths_since_last_delinq_segm -1 0 0 0 0 0 0 1 790 745 645 688 677 714 2 0 0 0 0 0 0 mths_since_last_delinq 35.0 36.0 37.0 38.0 39.0 40.0 \ mths_since_last_delinq_segm -1 0 0 0 0 0 0 1 669 748 802 755 691 616 2 0 0 0 0 0 0 mths_since_last_delinq 41.0 42.0 43.0 44.0 45.0 46.0 \ mths_since_last_delinq_segm -1 0 0 0 0 0 0 1 667 631 630 607 612 0 2 0 0 0 0 0 613 mths_since_last_delinq 47.0 48.0 49.0 50.0 51.0 52.0 \ mths_since_last_delinq_segm -1 0 0 0 0 0 0 1 0 0 0 0 0 0 2 679 683 579 495 431 478 mths_since_last_delinq 53.0 54.0 55.0 56.0 57.0 58.0 \ mths_since_last_delinq_segm -1 0 0 0 0 0 0 1 0 0 0 0 0 0 2 433 419 429 410 419 449 mths_since_last_delinq 59.0 60.0 61.0 62.0 63.0 64.0 \ mths_since_last_delinq_segm -1 0 0 0 0 0 0 1 0 0 0 0 0 0 2 473 490 495 481 458 417 mths_since_last_delinq 65.0 66.0 67.0 68.0 69.0 70.0 \ mths_since_last_delinq_segm -1 0 0 0 0 0 0 1 0 0 0 0 0 0 2 412 432 454 397 390 394 mths_since_last_delinq 71.0 72.0 73.0 74.0 75.0 76.0 \ mths_since_last_delinq_segm -1 0 0 0 0 0 0 1 0 0 0 0 0 0 2 403 411 433 407 412 364 mths_since_last_delinq 77.0 78.0 79.0 80.0 81.0 82.0 \ mths_since_last_delinq_segm -1 0 0 
0 0 0 0 1 0 0 0 0 0 0 2 320 333 328 276 228 126 mths_since_last_delinq 83.0 84.0 85.0 86.0 87.0 88.0 \ mths_since_last_delinq_segm -1 0 0 0 0 0 0 1 0 0 0 0 0 0 2 48 23 12 5 6 12 mths_since_last_delinq 89.0 90.0 91.0 92.0 93.0 94.0 \ mths_since_last_delinq_segm -1 0 0 0 0 0 0 1 0 0 0 0 0 0 2 7 5 7 5 4 6 mths_since_last_delinq 95.0 96.0 97.0 98.0 99.0 100.0 \ mths_since_last_delinq_segm -1 0 0 0 0 0 0 1 0 0 0 0 0 0 2 5 4 8 4 7 4 mths_since_last_delinq 101.0 102.0 103.0 104.0 105.0 106.0 \ mths_since_last_delinq_segm -1 0 0 0 0 0 0 1 0 0 0 0 0 0 2 11 3 8 4 3 2 mths_since_last_delinq 107.0 108.0 109.0 110.0 111.0 112.0 \ mths_since_last_delinq_segm -1 0 0 0 0 0 0 1 0 0 0 0 0 0 2 3 5 3 5 4 7 mths_since_last_delinq 113.0 114.0 115.0 116.0 118.0 119.0 \ mths_since_last_delinq_segm -1 0 0 0 0 0 0 1 0 0 0 0 0 0 2 2 1 1 2 3 1 mths_since_last_delinq 120.0 122.0 125.0 126.0 130.0 131.0 \ mths_since_last_delinq_segm -1 0 0 0 0 0 0 1 0 0 0 0 0 0 2 2 1 1 2 2 1 mths_since_last_delinq 133.0 138.0 156.0 158.0 160.0 226.0 mths_since_last_delinq_segm -1 0 0 0 0 0 0 1 0 0 0 0 0 0 2 1 1 1 1 1 1
# Display the categorical-variable crosstabs (part 2 of 2).
for table in (crosstab_absolute8, crosstab_absolute9, crosstab_absolute10,
              crosstab_absolute11, crosstab_absolute12, crosstab_absolute13,
              crosstab_absolute14, crosstab_absolute15, crosstab_absolute16,
              crosstab_absolute17, crosstab_absolute18, crosstab_absolute19,
              crosstab_absolute20, crosstab_absolute21):
    print("Crosstab with Absolute Volumes:\n", table)
Crosstab with Absolute Volumes: mths_since_last_record -1.0 1.0 2.0 3.0 4.0 5.0 \ mths_since_last_record_segm -1 92595 0 0 0 0 0 1 0 3 2 13 12 15 2 0 0 0 0 0 0 mths_since_last_record 6.0 7.0 8.0 9.0 10.0 11.0 \ mths_since_last_record_segm -1 0 0 0 0 0 0 1 16 17 27 25 24 21 2 0 0 0 0 0 0 mths_since_last_record 12.0 13.0 14.0 15.0 16.0 17.0 \ mths_since_last_record_segm -1 0 0 0 0 0 0 1 24 24 22 28 22 32 2 0 0 0 0 0 0 mths_since_last_record 18.0 19.0 20.0 21.0 22.0 23.0 \ mths_since_last_record_segm -1 0 0 0 0 0 0 1 26 30 24 25 30 35 2 0 0 0 0 0 0 mths_since_last_record 24.0 25.0 26.0 27.0 28.0 29.0 \ mths_since_last_record_segm -1 0 0 0 0 0 0 1 24 28 22 30 38 49 2 0 0 0 0 0 0 mths_since_last_record 30.0 31.0 32.0 33.0 34.0 35.0 \ mths_since_last_record_segm -1 0 0 0 0 0 0 1 49 36 63 47 55 54 2 0 0 0 0 0 0 mths_since_last_record 36.0 37.0 38.0 39.0 40.0 41.0 \ mths_since_last_record_segm -1 0 0 0 0 0 0 1 40 42 61 64 62 60 2 0 0 0 0 0 0 mths_since_last_record 42.0 43.0 44.0 45.0 46.0 47.0 \ mths_since_last_record_segm -1 0 0 0 0 0 0 1 81 70 78 93 85 106 2 0 0 0 0 0 0 mths_since_last_record 48.0 49.0 50.0 51.0 52.0 53.0 \ mths_since_last_record_segm -1 0 0 0 0 0 0 1 105 87 94 74 91 102 2 0 0 0 0 0 0 mths_since_last_record 54.0 55.0 56.0 57.0 58.0 59.0 \ mths_since_last_record_segm -1 0 0 0 0 0 0 1 128 125 122 143 161 145 2 0 0 0 0 0 0 mths_since_last_record 60.0 61.0 62.0 63.0 64.0 65.0 \ mths_since_last_record_segm -1 0 0 0 0 0 0 1 121 129 122 161 154 159 2 0 0 0 0 0 0 mths_since_last_record 66.0 67.0 68.0 69.0 70.0 71.0 \ mths_since_last_record_segm -1 0 0 0 0 0 0 1 152 165 143 0 0 0 2 0 0 0 182 200 191 mths_since_last_record 72.0 73.0 74.0 75.0 76.0 77.0 \ mths_since_last_record_segm -1 0 0 0 0 0 0 1 0 0 0 0 0 0 2 181 169 208 200 207 218 mths_since_last_record 78.0 79.0 80.0 81.0 82.0 83.0 \ mths_since_last_record_segm -1 0 0 0 0 0 0 1 0 0 0 0 0 0 2 224 246 233 233 296 248 mths_since_last_record 84.0 85.0 86.0 87.0 88.0 89.0 \ mths_since_last_record_segm -1 0 0 0 0 
0 0 1 0 0 0 0 0 0 2 270 255 230 252 260 269 mths_since_last_record 90.0 91.0 92.0 93.0 94.0 95.0 \ mths_since_last_record_segm -1 0 0 0 0 0 0 1 0 0 0 0 0 0 2 245 263 270 267 299 240 mths_since_last_record 96.0 97.0 98.0 99.0 100.0 101.0 \ mths_since_last_record_segm -1 0 0 0 0 0 0 1 0 0 0 0 0 0 2 224 218 187 255 224 229 mths_since_last_record 102.0 103.0 104.0 105.0 106.0 107.0 \ mths_since_last_record_segm -1 0 0 0 0 0 0 1 0 0 0 0 0 0 2 238 209 258 248 218 224 mths_since_last_record 108.0 109.0 110.0 111.0 112.0 113.0 \ mths_since_last_record_segm -1 0 0 0 0 0 0 1 0 0 0 0 0 0 2 163 166 169 163 159 140 mths_since_last_record 114.0 115.0 116.0 117.0 118.0 119.0 \ mths_since_last_record_segm -1 0 0 0 0 0 0 1 0 0 0 0 0 0 2 151 134 115 128 116 30 mths_since_last_record 120.0 121.0 122.0 123.0 124.0 mths_since_last_record_segm -1 0 0 0 0 0 1 0 0 0 0 0 2 10 4 1 13 2 Crosstab with Absolute Volumes: mths_since_last_major_derog -1.0 0.0 1.0 2.0 3.0 \ mths_since_last_major_derog_segm -1 83103 0 0 0 0 1 0 9 11 18 69 2 0 0 0 0 0 3 0 0 0 0 0 mths_since_last_major_derog 4.0 5.0 6.0 7.0 8.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 97 109 137 148 154 2 0 0 0 0 0 3 0 0 0 0 0 mths_since_last_major_derog 9.0 10.0 11.0 12.0 13.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 174 184 190 239 250 2 0 0 0 0 0 3 0 0 0 0 0 mths_since_last_major_derog 14.0 15.0 16.0 17.0 18.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 255 218 221 219 271 2 0 0 0 0 0 3 0 0 0 0 0 mths_since_last_major_derog 19.0 20.0 21.0 22.0 23.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 247 233 247 268 270 2 0 0 0 0 0 3 0 0 0 0 0 mths_since_last_major_derog 24.0 25.0 26.0 27.0 28.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 336 373 306 284 341 2 0 0 0 0 0 3 0 0 0 0 0 mths_since_last_major_derog 29.0 30.0 31.0 32.0 33.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 300 355 307 348 334 2 0 0 0 0 0 3 0 0 0 0 0 mths_since_last_major_derog 34.0 35.0 36.0 37.0 38.0 \ 
mths_since_last_major_derog_segm -1 0 0 0 0 0 1 396 366 410 420 405 2 0 0 0 0 0 3 0 0 0 0 0 mths_since_last_major_derog 39.0 40.0 41.0 42.0 43.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 353 349 364 331 364 2 0 0 0 0 0 3 0 0 0 0 0 mths_since_last_major_derog 44.0 45.0 46.0 47.0 48.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 322 375 0 0 0 2 0 0 371 406 434 3 0 0 0 0 0 mths_since_last_major_derog 49.0 50.0 51.0 52.0 53.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 0 0 0 0 0 2 398 360 309 350 353 3 0 0 0 0 0 mths_since_last_major_derog 54.0 55.0 56.0 57.0 58.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 0 0 0 0 0 2 362 350 328 362 415 3 0 0 0 0 0 mths_since_last_major_derog 59.0 60.0 61.0 62.0 63.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 0 0 0 0 0 2 402 394 379 356 384 3 0 0 0 0 0 mths_since_last_major_derog 64.0 65.0 66.0 67.0 68.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 0 0 0 0 0 2 361 344 349 396 364 3 0 0 0 0 0 mths_since_last_major_derog 69.0 70.0 71.0 72.0 73.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 0 0 0 0 0 2 329 319 351 322 359 3 0 0 0 0 0 mths_since_last_major_derog 74.0 75.0 76.0 77.0 78.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 0 0 0 0 0 2 339 332 275 245 235 3 0 0 0 0 0 mths_since_last_major_derog 79.0 80.0 81.0 82.0 83.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 0 3 247 194 165 100 44 mths_since_last_major_derog 84.0 85.0 86.0 87.0 88.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 0 3 22 16 12 15 23 mths_since_last_major_derog 89.0 90.0 91.0 92.0 93.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 0 3 12 11 16 11 11 mths_since_last_major_derog 94.0 95.0 96.0 97.0 98.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 0 3 8 10 6 16 12 mths_since_last_major_derog 99.0 100.0 101.0 102.0 103.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 0 3 15 8 21 8 14 
mths_since_last_major_derog 104.0 105.0 106.0 107.0 108.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 0 3 9 7 9 9 8 mths_since_last_major_derog 109.0 110.0 111.0 112.0 113.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 0 3 8 8 9 10 5 mths_since_last_major_derog 114.0 115.0 116.0 117.0 118.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 0 3 5 6 2 3 5 mths_since_last_major_derog 119.0 120.0 121.0 122.0 123.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 0 3 1 5 1 2 2 mths_since_last_major_derog 125.0 126.0 128.0 130.0 131.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 0 3 2 2 2 2 1 mths_since_last_major_derog 132.0 133.0 138.0 139.0 145.0 \ mths_since_last_major_derog_segm -1 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 0 3 2 1 1 1 1 mths_since_last_major_derog 153.0 156.0 158.0 160.0 226.0 mths_since_last_major_derog_segm -1 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 0 3 1 1 1 1 1 Crosstab with Absolute Volumes: open_rv_24m 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 \ open_rv_24m_segm 1 18931 24411 0 0 0 0 0 0 0 2 0 0 20872 14872 10004 6704 4209 0 0 3 0 0 0 0 0 0 0 2732 1719 open_rv_24m 9.0 10.0 11.0 12.0 13.0 14.0 15.0 16.0 17.0 18.0 \ open_rv_24m_segm 1 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 3 1124 721 512 318 213 166 101 67 55 43 open_rv_24m 19.0 20.0 21.0 22.0 23.0 24.0 25.0 26.0 27.0 29.0 \ open_rv_24m_segm 1 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 3 21 17 13 12 6 3 6 4 1 4 open_rv_24m 34.0 35.0 38.0 open_rv_24m_segm 1 0 0 0 2 0 0 0 3 1 1 1 Crosstab with Absolute Volumes: open_rv_12m 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 \ open_rv_12m_segm 1 42323 31776 0 0 0 0 0 0 0 2 0 0 17357 8443 3980 2042 971 446 243 open_rv_12m 9.0 10.0 11.0 12.0 13.0 14.0 15.0 16.0 17.0 18.0 \ open_rv_12m_segm 1 0 0 0 0 0 0 0 0 0 0 2 133 59 38 20 16 6 4 3 1 1 open_rv_12m 19.0 open_rv_12m_segm 1 0 2 2 Crosstab with Absolute Volumes: open_acc_6m 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 \ 
open_acc_6m_segm 1 49996 33005 0 0 0 0 0 0 0 2 0 0 15358 6116 2180 757 254 113 57 open_acc_6m 9.0 10.0 12.0 open_acc_6m_segm 1 0 0 0 2 21 4 3 Crosstab with Absolute Volumes: mort_acc 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 8.0 9.0 \ mort_acc_segm 1 48091 0 0 0 0 0 0 0 0 0 2 0 19449 0 0 0 0 0 0 0 0 3 0 0 16283 0 0 0 0 0 0 0 4 0 0 0 10988 6473 3385 1647 797 371 171 mort_acc 10.0 11.0 12.0 13.0 14.0 15.0 16.0 17.0 18.0 22.0 \ mort_acc_segm 1 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0 0 4 98 44 22 14 7 7 4 2 2 2 mort_acc 23.0 24.0 25.0 31.0 46.0 mort_acc_segm 1 0 0 0 0 0 2 0 0 0 0 0 3 0 0 0 0 0 4 2 2 1 1 1 Crosstab with Absolute Volumes: emp_length 1 year 10+ years 2 years 3 years 4 years 5 years \ emp_length_segm -1 0 0 0 0 0 0 1 0 35706 0 0 0 0 2 7169 0 10191 9179 6918 6815 emp_length 6 years 7 years 8 years 9 years < 1 year MISSING emp_length_segm -1 0 0 0 0 0 9428 1 0 0 0 3123 0 0 2 4716 4002 3278 0 7339 0 Crosstab with Absolute Volumes: open_il_12m 0.0 1.0 2.0 3.0 4.0 5.0 6.0 8.0 open_il_12m_segm 1 60679 31424 0 0 0 0 0 0 2 0 0 10938 3261 1092 350 119 1 Crosstab with Absolute Volumes: mths_since_recent_inq -1.0 0.0 1.0 2.0 3.0 4.0 5.0 \ mths_since_recent_inq_segm -1 13648 0 0 0 0 0 0 1 0 7360 8716 8012 0 0 0 2 0 0 0 0 7155 6805 6183 3 0 0 0 0 0 0 0 mths_since_recent_inq 6.0 7.0 8.0 9.0 10.0 11.0 12.0 \ mths_since_recent_inq_segm -1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 2 5784 5741 4902 4178 3788 0 0 3 0 0 0 0 0 3168 2887 mths_since_recent_inq 13.0 14.0 15.0 16.0 17.0 18.0 19.0 \ mths_since_recent_inq_segm -1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 3 2813 2636 2174 2025 1852 1771 1586 mths_since_recent_inq 20.0 21.0 22.0 23.0 24.0 mths_since_recent_inq_segm -1 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 0 0 3 1289 1130 985 859 417 Crosstab with Absolute Volumes: inq_last_6mths 0.0 1.0 2.0 3.0 4.0 5.0 inq_last_6mths_segm 1 0 0 0 2492 132 36 2 0 0 7996 0 0 0 3 0 25578 0 0 0 0 4 71630 0 0 0 0 0 Crosstab with Absolute Volumes: inq_last_12m 0.0 1.0 2.0 3.0 4.0 5.0 6.0 7.0 
8.0 \ inq_last_12m_segm 1 33209 26406 0 0 0 0 0 0 0 2 0 0 17422 11290 7094 4358 2800 0 0 3 0 0 0 0 0 0 0 1718 1165 inq_last_12m 9.0 10.0 11.0 12.0 13.0 14.0 15.0 16.0 17.0 18.0 \ inq_last_12m_segm 1 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 3 761 478 368 251 154 104 65 49 42 41 inq_last_12m 19.0 20.0 21.0 22.0 23.0 24.0 25.0 26.0 27.0 29.0 \ inq_last_12m_segm 1 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 3 25 14 9 11 2 4 3 6 3 3 inq_last_12m 30.0 31.0 33.0 39.0 40.0 45.0 46.0 inq_last_12m_segm 1 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 3 1 2 1 2 1 1 1 Crosstab with Absolute Volumes: fico_range_low 660.0 665.0 670.0 675.0 680.0 685.0 690.0 695.0 \ fico_range_low_segm 1 6326 6339 6452 5991 6613 0 0 0 2 0 0 0 0 0 6128 6083 5772 3 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 5 0 0 0 0 0 0 0 0 fico_range_low 700.0 705.0 710.0 715.0 720.0 725.0 730.0 735.0 \ fico_range_low_segm 1 0 0 0 0 0 0 0 0 2 5875 0 0 0 0 0 0 0 3 0 5542 5173 4700 4674 0 0 0 4 0 0 0 0 0 3849 3650 2965 5 0 0 0 0 0 0 0 0 fico_range_low 740.0 745.0 750.0 755.0 760.0 765.0 770.0 775.0 \ fico_range_low_segm 1 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 4 2929 2319 2176 1890 1674 1607 1339 1267 5 0 0 0 0 0 0 0 0 fico_range_low 780.0 785.0 790.0 795.0 800.0 805.0 810.0 815.0 \ fico_range_low_segm 1 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 4 1160 899 868 0 0 0 0 0 5 0 0 0 672 665 579 442 367 fico_range_low 820.0 825.0 830.0 835.0 840.0 845.0 fico_range_low_segm 1 0 0 0 0 0 0 2 0 0 0 0 0 0 3 0 0 0 0 0 0 4 0 0 0 0 0 0 5 291 262 141 88 54 43 Crosstab with Absolute Volumes: acc_open_past_24mths 0.0 1.0 2.0 3.0 4.0 5.0 6.0 \ acc_open_past_24mths_segm 1 5580 11992 0 0 0 0 0 2 0 0 16061 16273 14819 11841 9159 3 0 0 0 0 0 0 0 acc_open_past_24mths 7.0 8.0 9.0 10.0 11.0 12.0 13.0 14.0 \ acc_open_past_24mths_segm 1 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 3 6563 4709 3377 2375 1553 1137 746 515 acc_open_past_24mths 15.0 16.0 17.0 18.0 19.0 20.0 21.0 22.0 \ acc_open_past_24mths_segm 1 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 3 344 228 192 
119 78 47 40 36 acc_open_past_24mths 23.0 24.0 25.0 26.0 27.0 28.0 29.0 30.0 \ acc_open_past_24mths_segm 1 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 3 20 16 13 6 6 4 5 4 acc_open_past_24mths 31.0 33.0 35.0 36.0 37.0 38.0 acc_open_past_24mths_segm 1 0 0 0 0 0 0 2 0 0 0 0 0 0 3 1 1 1 1 1 1 Crosstab with Absolute Volumes: verification_status Not Verified Source Verified Verified verification_status_segm 1 0 0 24769 2 0 41644 0 3 41451 0 0
# X_train_segmented is the dataset with segmented variables
# target_variable is the target variable with default information
# Compute the Information Value (IV) of each segmented variable against the
# default flag; calculate_iv is defined earlier in this notebook.
iv_df = calculate_iv(X_train_segmented, y_train)
print(iv_df)
Variable IV 16 total_pymnt_round_segm 55.609418 0 mths_since_recent_inq_segm 7.757868 9 verification_status_segm 7.537108 12 inq_last_6mths_segm 7.337686 2 fico_range_low_segm 6.496426 23 inq_last_12m_segm 4.925817 3 term_segm 4.768724 10 purpose_segm 3.344770 7 instlmnt_round_segm 3.042972 25 mort_acc_segm 2.967846 4 emp_length_segm 2.744972 19 open_acc_6m_segm 2.630914 24 acc_open_past_24mths_segm 2.506123 22 open_rv_24m_segm 2.240605 21 open_rv_12m_segm 2.041305 15 revol_bal_segm 1.635041 6 Annual_Inc_round_segm 1.607459 17 mths_since_last_major_derog_segm 1.044393 20 open_il_12m_segm 0.756474 1 delinq_2yrs_segm 0.495526 13 mths_since_last_delinq_segm 0.396014 14 mths_since_last_record_segm 0.387369 5 dti_rounded_segm 0.372996 18 application_type_segm 0.369479 11 years_with_Credit_line_segm 0.103906 8 home_ownership_segm 0.062790
# Mapping dictionary for legends: for each segmented variable, maps the
# human-readable label of a segment to the integer code stored in the data.
# Code -1 always denotes a MISSING/absent value; several labels may share a
# code (e.g. emp_length '10+ years' and '9 years' both map to segment 1).
mapping_dict = {
    'mths_since_recent_inq_segm': {
        '0-2 months': 1,
        '3-10 months': 2,
        '11+ months': 3,
        'MISSING': -1
    },
    'delinq_2yrs_segm': {
        '3+ delinquencies': 1,
        '2 delinquencies': 2,
        '1 delinquency': 3,
        '0 delinquencies': 4
    },
    'fico_range_low_segm': {
        '<=680': 1,
        '685-700': 2,
        '705-720': 3,
        '725-790': 4,
        '791+': 5
    },
    'term_segm': {
        '60 months': 1,
        '36 months': 2
    },
    'emp_length_segm': {
        '10+ years': 1,
        '9 years': 1,
        '1-8 years': 2,
        '< 1 year': 2,
        'MISSING': -1
    },
    'dti_rounded_segm': {
        '0-7': 1,
        '8+': 2,
        'MISSING': -1
    },
    'Annual_Inc_round_segm': {
        '<=84999': 1,
        '85000-109999': 2,
        '110000-214999': 3,
        '215000+': 4
    },
    'instlmnt_round_segm': {
        '<=479': 1,
        '480-699': 2,
        '700-879': 3,
        '880+': 4
    },
    'home_ownership_segm': {
        'ANY, OWN': 1,
        'MORTGAGE, RENT': 2
    },
    'verification_status_segm': {
        'Verified': 1,
        'Source Verified': 2,
        'Not Verified': 3
    },
    'purpose_segm': {
        'car, house, major_purchase, medical, moving, small_business': 1,
        'debt_consolidation, home_improvement, other, renewable_energy, vacation, wedding': 2,
        'credit_card': 3
    },
    'years_with_Credit_line_segm': {
        '10-18 years': 1,
        '40+ years': 1,
        'Other': 2
    },
    'inq_last_6mths_segm': {
        '3+ inquiries': 1,
        '2 inquiries': 2,
        '1 inquiry': 3,
        '0 inquiries': 4
    },
    'mths_since_last_delinq_segm': {
        '0-45 months': 1,
        '46+': 2,
        'MISSING': -1
    },
    'mths_since_last_record_segm': {
        '1-68 months': 1,
        '69+ months': 2,
        'MISSING': -1
    },
    'revol_bal_segm': {
        '<=9999': 1,
        '10000-19999': 2,
        '20000-39999': 3,
        '40000+': 4
    },
    # 'out_prncp_round_segm': {
    #     '>0': 1,
    #     '0': 2
    # },
    'total_pymnt_round_segm': {
        '<=4999': 1,
        '5000-9999': 2,
        '10000-14999': 3,
        '15000+': 4
    },
    'mths_since_last_major_derog_segm': {
        '0-45 months': 1,
        '46-78 months': 2,
        '79+ months': 3,
        'MISSING': -1
    },
    # NOTE(review): the data's raw category is 'Joint App' (with a space,
    # see the application_type crosstab) — legend label kept as written;
    # confirm the underscore is intentional.
    'application_type_segm': {
        'Joint_App': 1,
        'Individual': 2
    },
    'open_acc_6m_segm': {
        '<=1': 1,
        '>1': 2,
        'MISSING': -1
    },
    'open_il_12m_segm': {
        '<=1': 1,
        '>1': 2,
        'MISSING': -1
    },
    'open_rv_12m_segm': {
        '<=1': 1,
        '>1': 2,
        'MISSING': -1
    },
    'open_rv_24m_segm': {
        '<=1': 1,
        '2-6': 2,
        '7+': 3,
        'MISSING': -1
    },
    'inq_last_12m_segm': {
        '<=1': 1,
        '2-6': 2,
        '7+': 3,
        'MISSING': -1
    },
    'acc_open_past_24mths_segm': {
        '<=1': 1,
        '2-6': 2,
        '7+': 3,
        'MISSING': -1
    },
    'mort_acc_segm': {
        '0': 1,
        '1': 2,
        '2': 3,
        '3+': 4
    }
}
# Segmented predictor columns, in the order used for plotting and encoding.
# 'out_prncp_round_segm' is deliberately excluded.
_SEGM_BASES = (
    'total_pymnt_round', 'verification_status', 'mths_since_recent_inq',
    'inq_last_6mths', 'inq_last_12m', 'fico_range_low',
    'purpose', 'instlmnt_round', 'acc_open_past_24mths',
    'term', 'open_rv_24m', 'open_acc_6m',
    'mort_acc', 'open_rv_12m', 'revol_bal',
    'emp_length', 'open_il_12m', 'mths_since_last_major_derog',
    'Annual_Inc_round', 'dti_rounded', 'mths_since_last_record',
    'mths_since_last_delinq', 'delinq_2yrs', 'application_type',
    'home_ownership', 'years_with_Credit_line',
)
segmented_vars = [base + '_segm' for base in _SEGM_BASES]
# Single training frame: features plus the default flag (index-aligned),
# used by the default-rate plots below.
train_data = X_train_segmented.assign(Defaulted=y_train)
# Define color palette
color_palette = sns.color_palette("hsv", 10)
# Function to create bar plots for default rate
def plot_default_rate(segment):
    """Bar-plot the default rate (%) for each level of one segmented variable.

    Each bar is annotated with the share (%) of the training population in
    that segment, and the legend shows the human-readable label for each
    integer segment code taken from ``mapping_dict``.

    Relies on notebook-level ``train_data`` (features + 'Defaulted' flag),
    ``mapping_dict`` and ``color_palette``.

    Parameters
    ----------
    segment : str
        Name of a segmented column in ``train_data``.
    """
    # Default rate and population share in one pass. groupby(sort=True) orders
    # the segment codes ascending, which matches seaborn's x-axis order for
    # numeric categories — so row position == bar position when annotating.
    # (The previous merge of a groupby result with value_counts() relied on
    # an implicit row order and a version-fragile reset_index rename.)
    stats = (
        train_data.groupby(segment)['Defaulted']
        .agg(Defaulted='mean', PopulationPercentage='size')
        .reset_index()
    )
    stats['Defaulted'] *= 100  # rate as a percentage
    # Segment columns are integer-coded (no NaN), so len(train_data) equals
    # the number of grouped rows — same denominator as value_counts(normalize=True).
    stats['PopulationPercentage'] = stats['PopulationPercentage'] / len(train_data) * 100

    # One palette color per segment level, keyed by the stringified code.
    unique_vals = stats[segment].astype(str).unique()
    color_map = {str(val): color_palette[idx % len(color_palette)]
                 for idx, val in enumerate(unique_vals)}

    plt.figure(figsize=(12, 6))
    bar_plot = sns.barplot(x=segment, y='Defaulted', data=stats, palette=color_map)
    plt.title(f'Default Rate by {segment}')
    plt.xlabel(segment)
    plt.ylabel('Default Rate (%)')

    # Annotate each bar (at half its height) with the population share.
    for position, row in stats.iterrows():
        bar_plot.text(position, row['Defaulted'] / 2,
                      f'{row["PopulationPercentage"]:.2f}%',
                      color='black', ha="center", va="center")

    # Legend: 'label: code', colored like the matching bar (grey if the code
    # never occurs in the data).
    mapping = mapping_dict.get(segment, {})
    handles = [plt.Line2D([0], [0], color=color_map.get(str(v), 'grey'), lw=4,
                          label=f'{k}: {v}')
               for k, v in mapping.items()]
    plt.legend(handles=handles, title='Mappings', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()
# One default-rate plot per segmented variable.
for seg_col in segmented_vars:
    plot_default_rate(seg_col)
# Confirm every segmented column is integer-coded before encoding.
for seg_col in segmented_vars:
    print(seg_col, X_train_segmented[seg_col].dtype, sep=": ")
total_pymnt_round_segm: int32 verification_status_segm: int32 mths_since_recent_inq_segm: int32 inq_last_6mths_segm: int32 inq_last_12m_segm: int32 fico_range_low_segm: int64 purpose_segm: int32 instlmnt_round_segm: int32 acc_open_past_24mths_segm: int32 term_segm: int32 open_rv_24m_segm: int32 open_acc_6m_segm: int32 mort_acc_segm: int32 open_rv_12m_segm: int32 revol_bal_segm: int32 emp_length_segm: int32 open_il_12m_segm: int32 mths_since_last_major_derog_segm: int32 Annual_Inc_round_segm: int32 dti_rounded_segm: int32 mths_since_last_record_segm: int32 mths_since_last_delinq_segm: int32 delinq_2yrs_segm: int32 application_type_segm: int32 home_ownership_segm: int32 years_with_Credit_line_segm: int32
from feature_engine.encoding import OneHotEncoder

# Dummy-encode every segmented variable. drop_last=True keeps n-1 dummies per
# variable, which a linear (logistic) model needs to avoid the dummy-variable
# trap.
dummy_encoder = OneHotEncoder(top_categories=None,
                              variables=segmented_vars,  # we select which variables to encode
                              drop_last=True)
# Keep the segmented vars only, cast to 'category' so they are compatible
# with the encoder. A frame-level astype on the column subset replaces the
# previous per-column assignment loops, which mutated a slice of
# X_train_segmented and risked SettingWithCopyWarning / chained assignment.
X_train_segmented_final = X_train_segmented[segmented_vars].astype('category')
X_test_segmented_final = X_test_segmented[segmented_vars].astype('category')
# Fit on the training set only (no information leaks from the test set).
dummy_encoder.fit(X_train_segmented_final)
OneHotEncoder(drop_last=True,
variables=['total_pymnt_round_segm', 'verification_status_segm',
'mths_since_recent_inq_segm', 'inq_last_6mths_segm',
'inq_last_12m_segm', 'fico_range_low_segm',
'purpose_segm', 'instlmnt_round_segm',
'acc_open_past_24mths_segm', 'term_segm',
'open_rv_24m_segm', 'open_acc_6m_segm',
'mort_acc_segm', 'open_rv_12m_segm', 'revol_bal_segm',
'emp_length_segm', 'open_il_12m_segm',
'mths_since_last_major_derog_segm',
'Annual_Inc_round_segm', 'dti_rounded_segm',
'mths_since_last_record_segm',
'mths_since_last_delinq_segm', 'delinq_2yrs_segm',
'application_type_segm', 'home_ownership_segm',
'years_with_Credit_line_segm'])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. OneHotEncoder(drop_last=True,
variables=['total_pymnt_round_segm', 'verification_status_segm',
'mths_since_recent_inq_segm', 'inq_last_6mths_segm',
'inq_last_12m_segm', 'fico_range_low_segm',
'purpose_segm', 'instlmnt_round_segm',
'acc_open_past_24mths_segm', 'term_segm',
'open_rv_24m_segm', 'open_acc_6m_segm',
'mort_acc_segm', 'open_rv_12m_segm', 'revol_bal_segm',
'emp_length_segm', 'open_il_12m_segm',
'mths_since_last_major_derog_segm',
'Annual_Inc_round_segm', 'dti_rounded_segm',
'mths_since_last_record_segm',
'mths_since_last_delinq_segm', 'delinq_2yrs_segm',
'application_type_segm', 'home_ownership_segm',
'years_with_Credit_line_segm'])X_train_segmented_final
| total_pymnt_round_segm | verification_status_segm | mths_since_recent_inq_segm | inq_last_6mths_segm | inq_last_12m_segm | fico_range_low_segm | purpose_segm | instlmnt_round_segm | acc_open_past_24mths_segm | term_segm | open_rv_24m_segm | open_acc_6m_segm | mort_acc_segm | open_rv_12m_segm | revol_bal_segm | emp_length_segm | open_il_12m_segm | mths_since_last_major_derog_segm | Annual_Inc_round_segm | dti_rounded_segm | mths_since_last_record_segm | mths_since_last_delinq_segm | delinq_2yrs_segm | application_type_segm | home_ownership_segm | years_with_Credit_line_segm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 1 | 4 | 2 | 1 | 2 | 1 | 2 | 2 | 2 | 1 | 1 | 1 | 1 | 2 | 1 | 1 | 1 | 2 | -1 | 1 | 3 | 2 | 2 | 2 |
| 1 | 2 | 2 | 2 | 4 | 3 | 2 | 2 | 2 | 3 | 2 | 3 | 2 | 4 | 2 | 3 | 2 | 2 | -1 | 3 | 2 | -1 | 2 | 4 | 2 | 1 | 2 |
| 2 | 1 | 3 | 1 | 2 | 3 | 1 | 1 | 1 | 2 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 1 | 2 | 1 | 2 | -1 | 1 | 3 | -1 | 2 | 1 |
| 3 | 2 | 2 | 1 | 3 | 2 | 1 | 2 | 2 | 2 | 1 | 1 | 1 | 3 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | -1 | 1 | 3 | 2 | 2 | 2 |
| 4 | 1 | 2 | 2 | 4 | 2 | 4 | 1 | 1 | 2 | 2 | 1 | 1 | 4 | 1 | 1 | 1 | 1 | -1 | 1 | 1 | -1 | 1 | 4 | 2 | 2 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 107859 | 3 | 3 | -1 | 4 | 1 | 1 | 2 | 1 | 2 | 1 | 1 | 1 | 2 | 1 | 2 | -1 | 2 | 1 | 2 | 2 | -1 | 1 | 4 | 2 | 2 | 2 |
| 107860 | 1 | 3 | 3 | 4 | 1 | 2 | 2 | 1 | 2 | 2 | 2 | 1 | 4 | 1 | 3 | 1 | 1 | -1 | 1 | 2 | -1 | -1 | 4 | 2 | 2 | 2 |
| 107861 | 1 | 2 | 2 | 4 | 2 | 2 | 2 | 1 | 2 | 2 | 1 | 1 | 3 | 1 | 1 | 1 | 1 | 2 | 1 | 2 | -1 | 2 | 4 | 2 | 2 | 2 |
| 107862 | 2 | 1 | 3 | 4 | 1 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 1 | 1 | 1 | 2 | -1 | 1 | 3 | 2 | 2 | 2 |
| 107863 | 3 | 2 | 1 | 3 | 2 | 1 | 1 | 1 | 2 | 2 | 2 | 1 | 1 | 1 | 1 | 2 | 1 | 1 | 1 | 1 | -1 | 1 | 4 | 2 | 1 | 1 |
107864 rows × 26 columns
# Inspect which segmented variables the fitted dummy encoder will one-hot encode.
dummy_encoder.variables_
['total_pymnt_round_segm', 'verification_status_segm', 'mths_since_recent_inq_segm', 'inq_last_6mths_segm', 'inq_last_12m_segm', 'fico_range_low_segm', 'purpose_segm', 'instlmnt_round_segm', 'acc_open_past_24mths_segm', 'term_segm', 'open_rv_24m_segm', 'open_acc_6m_segm', 'mort_acc_segm', 'open_rv_12m_segm', 'revol_bal_segm', 'emp_length_segm', 'open_il_12m_segm', 'mths_since_last_major_derog_segm', 'Annual_Inc_round_segm', 'dti_rounded_segm', 'mths_since_last_record_segm', 'mths_since_last_delinq_segm', 'delinq_2yrs_segm', 'application_type_segm', 'home_ownership_segm', 'years_with_Credit_line_segm']
# transform training and test sets
# Apply the encoder fitted on the training data to BOTH splits, so the test
# set is encoded with exactly the categories learned from training.
X_train_encoded = dummy_encoder.transform(X_train_segmented_final)
X_test_encoded = dummy_encoder.transform(X_test_segmented_final)
# Display the encoded training set for a sanity check.
X_train_encoded
| total_pymnt_round_segm_1 | total_pymnt_round_segm_2 | total_pymnt_round_segm_3 | verification_status_segm_1 | verification_status_segm_2 | mths_since_recent_inq_segm_1 | mths_since_recent_inq_segm_2 | mths_since_recent_inq_segm_-1 | inq_last_6mths_segm_4 | inq_last_6mths_segm_2 | inq_last_6mths_segm_3 | inq_last_12m_segm_2 | inq_last_12m_segm_3 | fico_range_low_segm_1 | fico_range_low_segm_2 | fico_range_low_segm_4 | fico_range_low_segm_5 | purpose_segm_2 | purpose_segm_1 | instlmnt_round_segm_1 | instlmnt_round_segm_2 | instlmnt_round_segm_3 | acc_open_past_24mths_segm_2 | acc_open_past_24mths_segm_3 | term_segm_2 | open_rv_24m_segm_2 | open_rv_24m_segm_3 | open_acc_6m_segm_1 | mort_acc_segm_1 | mort_acc_segm_4 | mort_acc_segm_3 | open_rv_12m_segm_1 | revol_bal_segm_1 | revol_bal_segm_3 | revol_bal_segm_2 | emp_length_segm_2 | emp_length_segm_1 | open_il_12m_segm_1 | mths_since_last_major_derog_segm_1 | mths_since_last_major_derog_segm_-1 | mths_since_last_major_derog_segm_2 | Annual_Inc_round_segm_1 | Annual_Inc_round_segm_3 | Annual_Inc_round_segm_2 | dti_rounded_segm_2 | dti_rounded_segm_1 | mths_since_last_record_segm_-1 | mths_since_last_record_segm_2 | mths_since_last_delinq_segm_1 | mths_since_last_delinq_segm_2 | delinq_2yrs_segm_3 | delinq_2yrs_segm_4 | delinq_2yrs_segm_1 | application_type_segm_2 | home_ownership_segm_2 | years_with_Credit_line_segm_2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 1 |
| 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 |
| 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 3 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 1 |
| 4 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 107859 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 1 |
| 107860 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 1 |
| 107861 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 1 |
| 107862 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 1 |
| 107863 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
107864 rows × 56 columns
from sklearn.linear_model import LogisticRegression
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
import pandas as pd

# Logistic regression base estimator (fixed seed for reproducibility).
log_reg = LogisticRegression(random_state=100)

# Forward stepwise selection: at each step greedily add the dummy variable
# that most improves the 5-fold cross-validated ROC AUC.
sfs_forward = SFS(
    estimator=log_reg,
    k_features='best',
    forward=True,
    floating=False,
    scoring='roc_auc',
    cv=5,
)

# Run the selection on the dummy-encoded training data.
sfs_log_reg_fitted = sfs_forward.fit(X_train_encoded, y_train)

# One row per subset size: chosen feature indices, per-fold scores, mean AUC.
sfs_log_reg_pdf = pd.DataFrame(sfs_log_reg_fitted.subsets_).T
sfs_log_reg_pdf
| feature_idx | cv_scores | avg_score | feature_names | |
|---|---|---|---|---|
| 1 | (0,) | [0.6439882397054446, 0.626075905668313, 0.6218... | 0.631047 | (total_pymnt_round_segm_1,) |
| 2 | (0, 19) | [0.7151702714308072, 0.6883516520327987, 0.685... | 0.693707 | (total_pymnt_round_segm_1, instlmnt_round_segm_1) |
| 3 | (0, 1, 19) | [0.7074235829754714, 0.704404450915725, 0.7225... | 0.719208 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 4 | (0, 1, 19, 20) | [0.7812859182667049, 0.7654186900552782, 0.777... | 0.775343 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 5 | (0, 1, 19, 20, 24) | [0.792806372720185, 0.7797363148019998, 0.7895... | 0.788324 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 6 | (0, 1, 8, 19, 20, 24) | [0.7972875363010188, 0.7911015291335035, 0.799... | 0.797291 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 7 | (0, 1, 8, 13, 19, 20, 24) | [0.8012594738989005, 0.7970304717242864, 0.802... | 0.802254 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 8 | (0, 1, 3, 8, 13, 19, 20, 24) | [0.8056326610862748, 0.8001170012743734, 0.807... | 0.80611 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 9 | (0, 1, 3, 8, 13, 19, 20, 21, 24) | [0.8104463668898177, 0.8057966379562842, 0.811... | 0.810134 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 10 | (0, 1, 3, 8, 13, 19, 20, 21, 24, 28) | [0.812859827302234, 0.808408212192469, 0.81275... | 0.812424 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 11 | (0, 1, 2, 3, 8, 13, 19, 20, 21, 24, 28) | [0.8123589994832174, 0.8083858261967757, 0.815... | 0.814345 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 12 | (0, 1, 2, 3, 8, 13, 19, 20, 21, 24, 28, 53) | [0.8154586382368225, 0.8107458756084481, 0.817... | 0.816312 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 13 | (0, 1, 2, 3, 4, 8, 13, 19, 20, 21, 24, 28, 53) | [0.81613084948234, 0.8132797877972238, 0.81846... | 0.817912 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 14 | (0, 1, 2, 3, 4, 8, 13, 19, 20, 21, 23, 24, 28,... | [0.8181919630466461, 0.8149916991081677, 0.819... | 0.819402 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 15 | (0, 1, 2, 3, 4, 8, 13, 14, 19, 20, 21, 23, 24,... | [0.8192254027805981, 0.8163605374299574, 0.820... | 0.820762 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 16 | (0, 1, 2, 3, 4, 5, 8, 13, 14, 19, 20, 21, 23, ... | [0.8201646541514327, 0.8159830891428601, 0.821... | 0.821901 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 17 | (0, 1, 2, 3, 4, 5, 8, 13, 14, 19, 20, 21, 23, ... | [0.8208574220967332, 0.8169434143585887, 0.822... | 0.822556 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 18 | (0, 1, 2, 3, 4, 5, 8, 13, 14, 19, 20, 21, 23, ... | [0.8212709555678036, 0.8177132528243932, 0.823... | 0.82323 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 19 | (0, 1, 2, 3, 4, 5, 8, 13, 14, 19, 20, 21, 23, ... | [0.8241231797561632, 0.8196227549942066, 0.826... | 0.82481 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 20 | (0, 1, 2, 3, 4, 5, 8, 13, 14, 17, 19, 20, 21, ... | [0.8242745078658027, 0.8201652619737698, 0.827... | 0.825401 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 21 | (0, 1, 2, 3, 4, 5, 8, 13, 14, 17, 18, 19, 20, ... | [0.8249949234979195, 0.8217764631458153, 0.829... | 0.826552 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 22 | (0, 1, 2, 3, 4, 5, 8, 13, 14, 16, 17, 18, 19, ... | [0.8256054825506209, 0.8219440628545791, 0.830... | 0.827039 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 23 | (0, 1, 2, 3, 4, 5, 8, 10, 13, 14, 16, 17, 18, ... | [0.8261880715989136, 0.8217945007826329, 0.830... | 0.827479 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 24 | (0, 1, 2, 3, 4, 5, 7, 8, 10, 13, 14, 16, 17, 1... | [0.8267210595510402, 0.8211445195503647, 0.832... | 0.827882 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 25 | (0, 1, 2, 3, 4, 5, 7, 8, 10, 13, 14, 15, 16, 1... | [0.8279712041462934, 0.8219215158085571, 0.831... | 0.828335 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 26 | (0, 1, 2, 3, 4, 5, 7, 8, 10, 13, 14, 15, 16, 1... | [0.8280176715992001, 0.8225550878017761, 0.832... | 0.828645 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 27 | (0, 1, 2, 3, 4, 5, 7, 8, 10, 13, 14, 15, 16, 1... | [0.8286929806749115, 0.8227898633921007, 0.832... | 0.828678 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 28 | (0, 1, 2, 3, 4, 5, 7, 8, 10, 13, 14, 15, 16, 1... | [0.8287795766680741, 0.8234944406858081, 0.832... | 0.82876 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 29 | (0, 1, 2, 3, 4, 5, 7, 8, 10, 13, 14, 15, 16, 1... | [0.828717512625001, 0.8236664961203333, 0.8322... | 0.828843 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 30 | (0, 1, 2, 3, 4, 5, 7, 8, 10, 13, 14, 15, 16, 1... | [0.8284529436012258, 0.8242021495136853, 0.832... | 0.828895 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 31 | (0, 1, 2, 3, 4, 5, 7, 8, 10, 12, 13, 14, 15, 1... | [0.8286129921923936, 0.824072289265287, 0.8330... | 0.829011 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 32 | (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13, 14, 15... | [0.8284227173736703, 0.8241185464985941, 0.833... | 0.829022 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 33 | (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13, 14, 15... | [0.8286569706372254, 0.8241361725623494, 0.833... | 0.829081 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 34 | (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13, 14, 15... | [0.8288580789083605, 0.8238451009348936, 0.832... | 0.829077 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 35 | (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13, 14, 15... | [0.8290385588535808, 0.8241533154751186, 0.832... | 0.829189 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 36 | (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... | [0.828690258881909, 0.8241913949306223, 0.8324... | 0.829131 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 37 | (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... | [0.8287036171554611, 0.8241492892169003, 0.832... | 0.829164 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 38 | (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... | [0.8288726011065521, 0.824220741879413, 0.8329... | 0.829187 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 39 | (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... | [0.828912299890018, 0.8241475355577652, 0.8328... | 0.829199 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 40 | (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... | [0.8288817871579359, 0.8245380468159704, 0.832... | 0.829251 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 41 | (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... | [0.8284581544023032, 0.8248537949326838, 0.832... | 0.829208 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 42 | (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... | [0.8287304769548294, 0.8243444285318767, 0.832... | 0.829149 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 43 | (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... | [0.8286825948858225, 0.8247075433397171, 0.832... | 0.829059 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 44 | (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... | [0.8285633194700956, 0.8242707569537238, 0.832... | 0.829037 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 45 | (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... | [0.828550408859866, 0.8242014158399654, 0.8328... | 0.828942 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 46 | (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... | [0.8289079127894546, 0.8241442071843049, 0.832... | 0.829038 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 47 | (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... | [0.8286533535175773, 0.8242787915756792, 0.832... | 0.828998 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 48 | (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... | [0.8282594456065784, 0.8244554995752567, 0.832... | 0.828971 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 49 | (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... | [0.8283608682089921, 0.8245476561522515, 0.832... | 0.828916 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 50 | (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... | [0.8282044367374727, 0.8243196088867714, 0.832... | 0.82886 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 51 | (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... | [0.8284016951040315, 0.8242619707635674, 0.832... | 0.828829 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 52 | (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... | [0.8284335329195492, 0.8240394349982263, 0.832... | 0.828781 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 53 | (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14... | [0.8281119674015149, 0.8239536130674949, 0.832... | 0.828654 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 54 | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | [0.8281095500195719, 0.823945793179311, 0.8328... | 0.828561 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 55 | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | [0.8279925129204588, 0.8239311197049157, 0.832... | 0.828534 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
| 56 | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | [0.8278504783014007, 0.8238715668722479, 0.832... | 0.828366 | (total_pymnt_round_segm_1, total_pymnt_round_s... |
This occurred at the feature combination in row 27, after which the cross-validated AUC stops improving meaningfully.
# Column indices of the feature subset chosen at step 27, where the
# cross-validated AUC stops improving meaningfully.
sfs_log_reg_pdf['feature_idx'][27]
(0, 1, 2, 3, 4, 5, 7, 8, 10, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23, 24, 27, 28, 35, 36, 42, 49, 53)
X_train_encoded
| total_pymnt_round_segm_1 | total_pymnt_round_segm_2 | total_pymnt_round_segm_3 | verification_status_segm_1 | verification_status_segm_2 | mths_since_recent_inq_segm_1 | mths_since_recent_inq_segm_2 | mths_since_recent_inq_segm_-1 | inq_last_6mths_segm_4 | inq_last_6mths_segm_2 | inq_last_6mths_segm_3 | inq_last_12m_segm_2 | inq_last_12m_segm_3 | fico_range_low_segm_1 | fico_range_low_segm_2 | fico_range_low_segm_4 | fico_range_low_segm_5 | purpose_segm_2 | purpose_segm_1 | instlmnt_round_segm_1 | instlmnt_round_segm_2 | instlmnt_round_segm_3 | acc_open_past_24mths_segm_2 | acc_open_past_24mths_segm_3 | term_segm_2 | open_rv_24m_segm_2 | open_rv_24m_segm_3 | open_acc_6m_segm_1 | mort_acc_segm_1 | mort_acc_segm_4 | mort_acc_segm_3 | open_rv_12m_segm_1 | revol_bal_segm_1 | revol_bal_segm_3 | revol_bal_segm_2 | emp_length_segm_2 | emp_length_segm_1 | open_il_12m_segm_1 | mths_since_last_major_derog_segm_1 | mths_since_last_major_derog_segm_-1 | mths_since_last_major_derog_segm_2 | Annual_Inc_round_segm_1 | Annual_Inc_round_segm_3 | Annual_Inc_round_segm_2 | dti_rounded_segm_2 | dti_rounded_segm_1 | mths_since_last_record_segm_-1 | mths_since_last_record_segm_2 | mths_since_last_delinq_segm_1 | mths_since_last_delinq_segm_2 | delinq_2yrs_segm_3 | delinq_2yrs_segm_4 | delinq_2yrs_segm_1 | application_type_segm_2 | home_ownership_segm_2 | years_with_Credit_line_segm_2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 1 |
| 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 |
| 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
| 3 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 1 |
| 4 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 107859 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 1 |
| 107860 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 1 | 1 |
| 107861 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 1 | 1 |
| 107862 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | 1 |
| 107863 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
107864 rows × 56 columns
# Keep the columns chosen by forward stepwise selection (step 27).  Take the
# indices straight from the selector's results, and define them once, so the
# train and test selections cannot drift out of sync with each other or with
# the stepwise output above.
log_reg_feature_idx = list(sfs_log_reg_pdf['feature_idx'][27])
X_train_log_reg = X_train_encoded.iloc[:, log_reg_feature_idx]
X_test_log_reg = X_test_encoded.iloc[:, log_reg_feature_idx]
# Display the reduced training set for inspection.
X_train_log_reg
| total_pymnt_round_segm_1 | total_pymnt_round_segm_2 | total_pymnt_round_segm_3 | verification_status_segm_1 | verification_status_segm_2 | mths_since_recent_inq_segm_1 | mths_since_recent_inq_segm_-1 | inq_last_6mths_segm_4 | inq_last_6mths_segm_3 | fico_range_low_segm_1 | fico_range_low_segm_2 | fico_range_low_segm_4 | fico_range_low_segm_5 | purpose_segm_2 | purpose_segm_1 | instlmnt_round_segm_1 | instlmnt_round_segm_2 | instlmnt_round_segm_3 | acc_open_past_24mths_segm_3 | term_segm_2 | open_acc_6m_segm_1 | mort_acc_segm_1 | emp_length_segm_2 | emp_length_segm_1 | Annual_Inc_round_segm_3 | mths_since_last_delinq_segm_2 | application_type_segm_2 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 1 |
| 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 1 | 1 |
| 2 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 |
| 3 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| 4 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 107859 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 |
| 107860 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| 107861 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 1 |
| 107862 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 107863 | 0 | 0 | 1 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 1 | 1 | 0 | 0 | 0 | 1 |
107864 rows × 27 columns
# Fit the logistic regression on the stepwise-selected training features.
log_reg.fit(X_train_log_reg, y_train)
LogisticRegression(random_state=100)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(random_state=100)
# Discriminatory power (KS statistic, AUC, Gini) of the model on the TRAINING split.
calculate_DiscriminatoryStats(X_train_log_reg, y_train, log_reg, 'TRAINING')
Credit Score num_applicants num_goods num_bads total cum_freq_goods \
0 171 1 0.0 1.0 1.0 0.0
1 178 1 0.0 1.0 1.0 0.0
2 179 2 0.0 2.0 2.0 0.0
3 181 1 0.0 1.0 1.0 0.0
4 183 2 0.0 2.0 2.0 0.0
.. ... ... ... ... ... ...
595 820 2 2.0 0.0 2.0 100940.0
596 821 2 2.0 0.0 2.0 100942.0
597 824 2 2.0 0.0 2.0 100944.0
598 829 1 1.0 0.0 1.0 100945.0
599 832 1 1.0 0.0 1.0 100946.0
cum_freq_bads perc_total_goods perc_total_bads cum_perc_goods \
0 1.0 0.0 0.01 0.00
1 2.0 0.0 0.01 0.00
2 4.0 0.0 0.03 0.00
3 5.0 0.0 0.01 0.00
4 7.0 0.0 0.03 0.00
.. ... ... ... ...
595 6918.0 0.0 0.00 99.99
596 6918.0 0.0 0.00 100.00
597 6918.0 0.0 0.00 100.00
598 6918.0 0.0 0.00 100.00
599 6918.0 0.0 0.00 100.00
cum_perc_bads Separation
0 0.01 -0.01
1 0.03 -0.03
2 0.06 -0.06
3 0.07 -0.07
4 0.10 -0.10
.. ... ...
595 100.00 -0.01
596 100.00 0.00
597 100.00 0.00
598 100.00 0.00
599 100.00 0.00
[600 rows x 12 columns]
The Kolmogorov-Smirnov statistic on the TRAINING data is: 46.86 AUC metric on the TRAINING data is: 0.83 Gini metric on the TRAINING data is: 0.66
# Discriminatory power (KS statistic, AUC, Gini) of the model on the held-out TEST split.
calculate_DiscriminatoryStats(X_test_log_reg, y_test, log_reg, 'TEST')
Credit Score num_applicants num_goods num_bads total cum_freq_goods \
0 167 1 0.0 1.0 1.0 0.0
1 175 1 0.0 1.0 1.0 0.0
2 182 1 0.0 1.0 1.0 0.0
3 183 2 0.0 2.0 2.0 0.0
4 184 1 0.0 1.0 1.0 0.0
.. ... ... ... ... ... ...
570 813 1 1.0 0.0 1.0 40545.0
571 814 3 3.0 0.0 3.0 40548.0
572 815 1 1.0 0.0 1.0 40549.0
573 818 1 1.0 0.0 1.0 40550.0
574 824 1 1.0 0.0 1.0 40551.0
cum_freq_bads perc_total_goods perc_total_bads cum_perc_goods \
0 1.0 0.00 0.04 0.00
1 2.0 0.00 0.04 0.00
2 3.0 0.00 0.04 0.00
3 5.0 0.00 0.08 0.00
4 6.0 0.00 0.04 0.00
.. ... ... ... ...
570 2377.0 0.00 0.00 99.99
571 2377.0 0.01 0.00 99.99
572 2377.0 0.00 0.00 100.00
573 2377.0 0.00 0.00 100.00
574 2377.0 0.00 0.00 100.00
cum_perc_bads Separation
0 0.04 -0.04
1 0.08 -0.08
2 0.13 -0.13
3 0.21 -0.21
4 0.25 -0.25
.. ... ...
570 100.00 -0.01
571 100.00 -0.01
572 100.00 0.00
573 100.00 0.00
574 100.00 0.00
[575 rows x 12 columns]
The Kolmogorov-Smirnov statistic on the TEST data is: 43.40 AUC metric on the TEST data is: 0.81 Gini metric on the TEST data is: 0.63
# Population Stability Index between the TRAINING and TEST score distributions
# (checks whether the scored populations have shifted between the two splits).
calculate_and_plot_psi(X_train_log_reg, X_test_log_reg, log_reg, 'TRAINING', 'TEST')
The PSI statistic between TRAINING and TEST sets is: 0.166 Moderate shift in the population (PSI = 0.166)
# Calculate and compare IVs
# Information Value of each selected dummy variable on both splits, to check
# that each variable's predictive strength is stable from train to test.
iv_comparison_df = calculate_iv_comparison(X_train_log_reg, X_test_log_reg, y_train, y_test)
iv_comparison_df
| Variable | IV_Train | IV_Test | |
|---|---|---|---|
| 0 | total_pymnt_round_segm_1 | 28.43 | 33.87 |
| 2 | total_pymnt_round_segm_3 | 11.75 | 21.32 |
| 7 | inq_last_6mths_segm_4 | 5.97 | 8.43 |
| 5 | mths_since_recent_inq_segm_1 | 5.91 | 7.37 |
| 3 | verification_status_segm_1 | 5.11 | 4.83 |
| 19 | term_segm_2 | 4.77 | 2.90 |
| 9 | fico_range_low_segm_1 | 3.38 | 3.89 |
| 11 | fico_range_low_segm_4 | 3.19 | 2.70 |
| 21 | mort_acc_segm_1 | 2.77 | 3.32 |
| 15 | instlmnt_round_segm_1 | 2.69 | 2.55 |
| 20 | open_acc_6m_segm_1 | 2.63 | 3.67 |
| 18 | acc_open_past_24mths_segm_3 | 2.23 | 3.59 |
| 6 | mths_since_recent_inq_segm_-1 | 2.18 | 2.89 |
| 1 | total_pymnt_round_segm_2 | 1.87 | 7.45 |
| 12 | fico_range_low_segm_5 | 1.29 | 1.08 |
| 23 | emp_length_segm_1 | 1.28 | 1.33 |
| 8 | inq_last_6mths_segm_3 | 1.27 | 2.07 |
| 24 | Annual_Inc_round_segm_3 | 1.07 | 1.37 |
| 13 | purpose_segm_2 | 0.81 | 1.11 |
| 14 | purpose_segm_1 | 0.79 | 0.66 |
| 17 | instlmnt_round_segm_3 | 0.49 | 0.84 |
| 10 | fico_range_low_segm_2 | 0.39 | 0.21 |
| 26 | application_type_segm_2 | 0.37 | 0.49 |
| 16 | instlmnt_round_segm_2 | 0.29 | 0.46 |
| 4 | verification_status_segm_2 | 0.06 | 0.18 |
| 22 | emp_length_segm_2 | 0.03 | 0.05 |
| 25 | mths_since_last_delinq_segm_2 | 0.03 | 0.01 |
Random Forest does not need dummy-encoded variables — it is a decision tree model and can handle categorical data exceptionally well.
# Initialize the Random Forest classifier with conservative regularisation
# (shallow trees, large leaf/split minimums) for the feature-selection stage.
Random_forest_clf = RandomForestClassifier(n_estimators=50, random_state=100,
max_depth=7,
max_features=7,
min_samples_leaf=50,
min_samples_split=50,
bootstrap=True,
criterion="entropy")
# Random Forest - forward stepwise selection (5-fold CV, ROC AUC scoring).
sfs_forward = SFS(Random_forest_clf,
k_features='best',
forward=True,
floating=False,
scoring='roc_auc',
cv=5)
# Fit the SFS to the training data.  The un-encoded segmented variables are
# used here: a tree model can split on the ordinal segment codes directly.
sfs_RandomForest_fitted = sfs_forward.fit(X_train_segmented_final, y_train)
# Collect the per-subset CV results into a DataFrame for inspection
sfs_RandomForest_pdf = pd.DataFrame(sfs_RandomForest_fitted.subsets_).T
sfs_RandomForest_pdf
| feature_idx | cv_scores | avg_score | feature_names | |
|---|---|---|---|---|
| 1 | (0,) | [0.6623325873471722, 0.6501395268470183, 0.650... | 0.658824 | (total_pymnt_round_segm,) |
| 2 | (0, 7) | [0.78496585403239, 0.7685981098560797, 0.78386... | 0.780542 | (total_pymnt_round_segm, instlmnt_round_segm) |
| 3 | (0, 1, 7) | [0.7961419479514388, 0.7896147124986794, 0.801... | 0.797495 | (total_pymnt_round_segm, verification_status_s... |
| 4 | (0, 1, 7, 9) | [0.80373696807301, 0.7981437052799026, 0.81151... | 0.805966 | (total_pymnt_round_segm, verification_status_s... |
| 5 | (0, 1, 5, 7, 9) | [0.8135796341122318, 0.8062696069827986, 0.815... | 0.813918 | (total_pymnt_round_segm, verification_status_s... |
| 6 | (0, 1, 3, 5, 7, 9) | [0.8148312291366508, 0.8100804693335838, 0.820... | 0.817563 | (total_pymnt_round_segm, verification_status_s... |
| 7 | (0, 1, 3, 5, 7, 9, 14) | [0.8139364934066355, 0.8137097742804486, 0.823... | 0.819501 | (total_pymnt_round_segm, verification_status_s... |
| 8 | (0, 1, 3, 5, 7, 9, 14, 15) | [0.8159134999858539, 0.8146649816746199, 0.825... | 0.820944 | (total_pymnt_round_segm, verification_status_s... |
| 9 | (0, 1, 3, 5, 7, 9, 14, 15, 23) | [0.8156785841805809, 0.8168742342951593, 0.826... | 0.822179 | (total_pymnt_round_segm, verification_status_s... |
| 10 | (0, 1, 2, 3, 5, 7, 9, 14, 15, 23) | [0.8183933757288406, 0.8170633252755679, 0.827... | 0.823524 | (total_pymnt_round_segm, verification_status_s... |
| 11 | (0, 1, 2, 3, 5, 7, 9, 14, 15, 23, 24) | [0.8198957517466927, 0.8170878765034586, 0.827... | 0.82384 | (total_pymnt_round_segm, verification_status_s... |
| 12 | (0, 1, 2, 3, 5, 7, 9, 12, 14, 15, 23, 24) | [0.8207472790127914, 0.8184729988029307, 0.827... | 0.824214 | (total_pymnt_round_segm, verification_status_s... |
| 13 | (0, 1, 2, 3, 5, 7, 9, 12, 14, 15, 19, 23, 24) | [0.8211393425509002, 0.8199693710905929, 0.827... | 0.824896 | (total_pymnt_round_segm, verification_status_s... |
| 14 | (0, 1, 2, 3, 5, 7, 9, 11, 12, 14, 15, 19, 23, 24) | [0.8229057324900073, 0.8185682332306523, 0.827... | 0.824727 | (total_pymnt_round_segm, verification_status_s... |
| 15 | (0, 1, 2, 3, 5, 7, 9, 11, 12, 14, 15, 19, 22, ... | [0.8221078890095789, 0.8203541203259257, 0.827... | 0.825102 | (total_pymnt_round_segm, verification_status_s... |
| 16 | (0, 1, 2, 3, 5, 7, 8, 9, 11, 12, 14, 15, 19, 2... | [0.8243333487329516, 0.8196154540459708, 0.826... | 0.824801 | (total_pymnt_round_segm, verification_status_s... |
| 17 | (0, 1, 2, 3, 5, 7, 8, 9, 11, 12, 13, 14, 15, 1... | [0.8244440290128809, 0.821399676954514, 0.8269... | 0.825399 | (total_pymnt_round_segm, verification_status_s... |
| 18 | (0, 1, 2, 3, 5, 7, 8, 9, 11, 12, 13, 14, 15, 1... | [0.8242124617292625, 0.8232964024649146, 0.827... | 0.825939 | (total_pymnt_round_segm, verification_status_s... |
| 19 | (0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15... | [0.8236040514605106, 0.8217669790709015, 0.826... | 0.825206 | (total_pymnt_round_segm, verification_status_s... |
| 20 | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14,... | [0.8248495940768055, 0.8211275734768861, 0.826... | 0.825175 | (total_pymnt_round_segm, verification_status_s... |
| 21 | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14,... | [0.8256272031750432, 0.8226433612764005, 0.828... | 0.825577 | (total_pymnt_round_segm, verification_status_s... |
| 22 | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14,... | [0.8235568498397545, 0.822390118581715, 0.8245... | 0.824789 | (total_pymnt_round_segm, verification_status_s... |
| 23 | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14,... | [0.8261224978753898, 0.8212807502339882, 0.825... | 0.824938 | (total_pymnt_round_segm, verification_status_s... |
| 24 | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14,... | [0.8250672658908841, 0.82103207063195, 0.82521... | 0.824195 | (total_pymnt_round_segm, verification_status_s... |
| 25 | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14,... | [0.8248818258360471, 0.8219798160275569, 0.824... | 0.82413 | (total_pymnt_round_segm, verification_status_s... |
| 26 | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | [0.8224554727199342, 0.8205941032102126, 0.824... | 0.822521 | (total_pymnt_round_segm, verification_status_s... |
# Again, choose to stop where the performance stops improving
# (average CV AUC peaks around subset size 15 and plateaus afterwards).
sfs_RandomForest_pdf['feature_idx'][15]
(0, 1, 2, 3, 5, 7, 9, 11, 12, 14, 15, 19, 22, 23, 24)
# Keep the step-15 feature subset.  Take the indices directly from the
# selector's results, defined once, so the train and test selections cannot
# drift out of sync with each other or with the stepwise output above.
rf_feature_idx = list(sfs_RandomForest_pdf['feature_idx'][15])
X_train_RandomForest = X_train_segmented_final.iloc[:, rf_feature_idx]
X_test_RandomForest = X_test_segmented_final.iloc[:, rf_feature_idx]
# Display the reduced training set for inspection.
X_train_RandomForest
| total_pymnt_round_segm | verification_status_segm | mths_since_recent_inq_segm | inq_last_6mths_segm | fico_range_low_segm | instlmnt_round_segm | term_segm | open_acc_6m_segm | mort_acc_segm | revol_bal_segm | emp_length_segm | dti_rounded_segm | delinq_2yrs_segm | application_type_segm | home_ownership_segm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 1 | 4 | 1 | 1 | 2 | 1 | 1 | 1 | 2 | 2 | 3 | 2 | 2 |
| 1 | 2 | 2 | 2 | 4 | 2 | 2 | 2 | 2 | 4 | 3 | 2 | 2 | 4 | 2 | 1 |
| 2 | 1 | 3 | 1 | 2 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 2 | 3 | -1 | 2 |
| 3 | 2 | 2 | 1 | 3 | 1 | 2 | 1 | 1 | 3 | 1 | 1 | 2 | 3 | 2 | 2 |
| 4 | 1 | 2 | 2 | 4 | 4 | 1 | 2 | 1 | 4 | 1 | 1 | 1 | 4 | 2 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 107859 | 3 | 3 | -1 | 4 | 1 | 1 | 1 | 1 | 2 | 2 | -1 | 2 | 4 | 2 | 2 |
| 107860 | 1 | 3 | 3 | 4 | 2 | 1 | 2 | 1 | 4 | 3 | 1 | 2 | 4 | 2 | 2 |
| 107861 | 1 | 2 | 2 | 4 | 2 | 1 | 2 | 1 | 3 | 1 | 1 | 2 | 4 | 2 | 2 |
| 107862 | 2 | 1 | 3 | 4 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 3 | 2 | 2 |
| 107863 | 3 | 2 | 1 | 3 | 1 | 1 | 2 | 1 | 1 | 1 | 2 | 1 | 4 | 2 | 1 |
107864 rows × 15 columns
from skopt import BayesSearchCV
from sklearn.model_selection import KFold, cross_validate
import numpy as np
import matplotlib.pyplot as plt
# Define the parameter space for Bayesian optimization
# (tuples are interpreted by BayesSearchCV as low/high ranges,
# the list as a categorical choice)
hyperparameter_space = {
    'n_estimators': (50, 200),
    'max_depth': (6, 9),
    'max_features': (6, 9),
    'criterion': ['gini','entropy'],
    'min_samples_leaf': (70, 150),
    'min_samples_split': (70, 150)
}
# Setup BayesSearchCV: 20 optimisation iterations, each candidate
# scored by 5-fold cross-validated ROC-AUC
opt = BayesSearchCV(
    estimator=RandomForestClassifier(random_state=100),
    search_spaces=hyperparameter_space,
    scoring='roc_auc',
    n_iter=20,
    cv=5,
    return_train_score=True,
    random_state=100
)
# Fit the BayesSearchCV on the forward-selected RF features
opt.fit(X_train_RandomForest, y_train)
# Best estimator found by BayesSearchCV
best_rfc = opt.best_estimator_
# Use KFold for the cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=100)
# Perform cross-validation using 'roc_auc' metric
# (refits best_rfc on each of the 5 folds)
results = cross_validate(best_rfc, X_train_RandomForest, y_train, cv=kf, scoring='roc_auc')
# Optionally plot the AUC results, one point per fold
plt.errorbar(range(len(results['test_score'])), results['test_score'], fmt='-o')
plt.ylabel('AUC Score')
plt.xlabel('Fold Number')
plt.title('Random Forest AUC Performance across Folds')
plt.show()
best_rfc
RandomForestClassifier(max_depth=9, max_features=9, min_samples_leaf=70,
min_samples_split=134, n_estimators=198,
random_state=100)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomForestClassifier(max_depth=9, max_features=9, min_samples_leaf=70,
min_samples_split=134, n_estimators=198,
random_state=100)# initialize the Randomg forest classifier
# Re-initialise the Random Forest classifier with the tuned hyper-parameters
# (n_jobs=-1 added here to parallelise training across all cores).
# NOTE(review): the printed best_rfc repr shows no criterion (i.e. the
# default 'gini'), while 'entropy' is hard-coded below — confirm this
# matches the intended best configuration.
Random_forest_clf = RandomForestClassifier(n_estimators=198, n_jobs=-1, random_state=100,
                                           max_depth=9,
                                           max_features=9,
                                           min_samples_leaf=70,
                                           min_samples_split=134,
                                           bootstrap=True,
                                           criterion="entropy")
# Fit on the forward-selected training features
Random_forest_clf.fit(X_train_RandomForest, y_train)
RandomForestClassifier(criterion='entropy', max_depth=9, max_features=9,
min_samples_leaf=70, min_samples_split=134,
n_estimators=198, n_jobs=-1, random_state=100)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. RandomForestClassifier(criterion='entropy', max_depth=9, max_features=9,
min_samples_leaf=70, min_samples_split=134,
n_estimators=198, n_jobs=-1, random_state=100)calculate_DiscriminatoryStats(X_train_RandomForest, y_train, Random_forest_clf, 'TRAIN')
Credit Score num_applicants num_goods num_bads total cum_freq_goods \
0 268 12 0.0 12.0 12.0 0.0
1 269 2 0.0 2.0 2.0 0.0
2 270 7 0.0 7.0 7.0 0.0
3 271 1 0.0 1.0 1.0 0.0
4 275 7 0.0 7.0 7.0 0.0
.. ... ... ... ... ... ...
375 685 1 1.0 0.0 1.0 100936.0
376 686 2 2.0 0.0 2.0 100938.0
377 695 4 4.0 0.0 4.0 100942.0
378 696 3 3.0 0.0 3.0 100945.0
379 699 1 1.0 0.0 1.0 100946.0
cum_freq_bads perc_total_goods perc_total_bads cum_perc_goods \
0 12.0 0.0 0.17 0.00
1 14.0 0.0 0.03 0.00
2 21.0 0.0 0.10 0.00
3 22.0 0.0 0.01 0.00
4 29.0 0.0 0.10 0.00
.. ... ... ... ...
375 6918.0 0.0 0.00 99.99
376 6918.0 0.0 0.00 99.99
377 6918.0 0.0 0.00 100.00
378 6918.0 0.0 0.00 100.00
379 6918.0 0.0 0.00 100.00
cum_perc_bads Separation
0 0.17 -0.17
1 0.20 -0.20
2 0.30 -0.30
3 0.32 -0.32
4 0.42 -0.42
.. ... ...
375 100.00 -0.01
376 100.00 -0.01
377 100.00 0.00
378 100.00 0.00
379 100.00 0.00
[380 rows x 12 columns]
The Kolmogorov-Smirnov statistic on the TRAIN data is: 48.74 AUC metric on the TRAIN data is: 0.84 Gini metric on the TRAIN data is: 0.68
calculate_DiscriminatoryStats(X_test_RandomForest, y_test, Random_forest_clf, 'TEST')
Credit Score num_applicants num_goods num_bads total cum_freq_goods \
0 266 1 0.0 1.0 1.0 0.0
1 267 1 0.0 1.0 1.0 0.0
2 268 3 1.0 2.0 3.0 1.0
3 270 2 2.0 0.0 2.0 3.0
4 275 6 1.0 5.0 6.0 4.0
.. ... ... ... ... ... ...
368 683 2 2.0 0.0 2.0 40544.0
369 684 3 3.0 0.0 3.0 40547.0
370 686 2 2.0 0.0 2.0 40549.0
371 687 1 1.0 0.0 1.0 40550.0
372 695 1 1.0 0.0 1.0 40551.0
cum_freq_bads perc_total_goods perc_total_bads cum_perc_goods \
0 1.0 0.00 0.04 0.00
1 2.0 0.00 0.04 0.00
2 4.0 0.00 0.08 0.00
3 4.0 0.00 0.00 0.01
4 9.0 0.00 0.21 0.01
.. ... ... ... ...
368 2377.0 0.00 0.00 99.98
369 2377.0 0.01 0.00 99.99
370 2377.0 0.00 0.00 100.00
371 2377.0 0.00 0.00 100.00
372 2377.0 0.00 0.00 100.00
cum_perc_bads Separation
0 0.04 -0.04
1 0.08 -0.08
2 0.17 -0.17
3 0.17 -0.16
4 0.38 -0.37
.. ... ...
368 100.00 -0.02
369 100.00 -0.01
370 100.00 0.00
371 100.00 0.00
372 100.00 0.00
[373 rows x 12 columns]
The Kolmogorov-Smirnov statistic on the TEST data is: 42.39 AUC metric on the TEST data is: 0.81 Gini metric on the TEST data is: 0.62
calculate_and_plot_psi(X_train_RandomForest, X_test_RandomForest, Random_forest_clf, 'TRAINING', 'TEST')
The PSI statistic between TRAINING and TEST sets is: 0.150 Moderate shift in the population (PSI = 0.150)
# Calculate and compare IVs (Information Values) of the RF feature set
# on the train vs test samples
iv_comparison_df = calculate_iv_comparison(X_train_RandomForest, X_test_RandomForest, y_train, y_test)
iv_comparison_df
| Variable | IV_Train | IV_Test | |
|---|---|---|---|
| 0 | total_pymnt_round_segm | 55.61 | 55.74 |
| 2 | mths_since_recent_inq_segm | 7.76 | 10.87 |
| 1 | verification_status_segm | 7.54 | 7.90 |
| 3 | inq_last_6mths_segm | 7.34 | 9.99 |
| 4 | fico_range_low_segm | 6.50 | 6.33 |
| 6 | term_segm | 4.77 | 2.90 |
| 5 | instlmnt_round_segm | 3.04 | 2.70 |
| 8 | mort_acc_segm | 2.97 | 3.75 |
| 10 | emp_length_segm | 2.74 | 2.83 |
| 7 | open_acc_6m_segm | 2.63 | 3.67 |
| 9 | revol_bal_segm | 1.64 | 2.66 |
| 12 | delinq_2yrs_segm | 0.50 | 0.10 |
| 11 | dti_rounded_segm | 0.37 | 0.76 |
| 13 | application_type_segm | 0.37 | 0.49 |
| 14 | home_ownership_segm | 0.06 | 0.13 |
Gradient Boosting does not need segmented variables — it can handle categorical data exceptionally well
# Initialize the Gradient Boosting classifier (base settings; it serves as
# the estimator for the forward feature selection below)
GradientBoosting_clf = GradientBoostingClassifier(n_estimators=50, random_state=100,
                                                  max_depth = 7,
                                                  min_samples_split = 50,
                                                  min_samples_leaf = 50
                                                  )
# GradientBoosting - forward sequential feature selection, each candidate
# subset scored by 5-fold cross-validated ROC-AUC
gb_forward_selector = SFS(GradientBoosting_clf,
                          k_features='best',
                          forward=True,
                          floating=False,
                          scoring='roc_auc',
                          cv=5)
# Fit the selector to the training data
sfs_GB_fitted = gb_forward_selector.fit(X_train_segmented_final, y_train)
# Collect the per-step subsets and scores into a DataFrame (one row per step)
sfs_GB_pdf = pd.DataFrame.from_dict(sfs_GB_fitted.subsets_, orient='index')
sfs_GB_pdf
| feature_idx | cv_scores | avg_score | feature_names | |
|---|---|---|---|---|
| 1 | (0,) | [0.6623325873471722, 0.6501395268470183, 0.650... | 0.658824 | (total_pymnt_round_segm,) |
| 2 | (0, 7) | [0.7849806448285755, 0.7685981098560798, 0.783... | 0.780537 | (total_pymnt_round_segm, instlmnt_round_segm) |
| 3 | (0, 1, 7) | [0.7960789706751874, 0.7896001642856509, 0.801... | 0.797504 | (total_pymnt_round_segm, verification_status_s... |
| 4 | (0, 1, 7, 9) | [0.8036432810928142, 0.7984550513542973, 0.811... | 0.806297 | (total_pymnt_round_segm, verification_status_s... |
| 5 | (0, 1, 5, 7, 9) | [0.81259010119698, 0.805397125774151, 0.815951... | 0.813734 | (total_pymnt_round_segm, verification_status_s... |
| 6 | (0, 1, 3, 5, 7, 9) | [0.8157633895204522, 0.8087837815590645, 0.819... | 0.817315 | (total_pymnt_round_segm, verification_status_s... |
| 7 | (0, 1, 3, 5, 7, 9, 14) | [0.818077379142542, 0.8158363722933882, 0.8232... | 0.820773 | (total_pymnt_round_segm, verification_status_s... |
| 8 | (0, 1, 3, 5, 7, 9, 14, 23) | [0.8208706192114893, 0.8193732164570817, 0.825... | 0.823546 | (total_pymnt_round_segm, verification_status_s... |
| 9 | (0, 1, 3, 5, 7, 9, 14, 15, 23) | [0.8252111269762993, 0.82227545074766, 0.82740... | 0.82551 | (total_pymnt_round_segm, verification_status_s... |
| 10 | (0, 1, 3, 5, 6, 7, 9, 14, 15, 23) | [0.8253856619525928, 0.8221907203802679, 0.831... | 0.826683 | (total_pymnt_round_segm, verification_status_s... |
| 11 | (0, 1, 3, 5, 6, 7, 9, 12, 14, 15, 23) | [0.8264928586956094, 0.824087750096845, 0.8318... | 0.82782 | (total_pymnt_round_segm, verification_status_s... |
| 12 | (0, 1, 3, 5, 6, 7, 9, 12, 13, 14, 15, 23) | [0.8274306238242124, 0.8259570970513617, 0.833... | 0.82924 | (total_pymnt_round_segm, verification_status_s... |
| 13 | (0, 1, 2, 3, 5, 6, 7, 9, 12, 13, 14, 15, 23) | [0.8288338334627975, 0.8229335918632508, 0.835... | 0.829392 | (total_pymnt_round_segm, verification_status_s... |
| 14 | (0, 1, 2, 3, 5, 6, 7, 9, 11, 12, 13, 14, 15, 23) | [0.827908638720299, 0.8246504778399043, 0.8366... | 0.830019 | (total_pymnt_round_segm, verification_status_s... |
| 15 | (0, 1, 2, 3, 5, 6, 7, 9, 11, 12, 13, 14, 15, 2... | [0.829238198789017, 0.8251433097402951, 0.8371... | 0.830789 | (total_pymnt_round_segm, verification_status_s... |
| 16 | (0, 1, 2, 3, 5, 6, 7, 9, 11, 12, 13, 14, 15, 2... | [0.8305616169169462, 0.8241356894113632, 0.836... | 0.830637 | (total_pymnt_round_segm, verification_status_s... |
| 17 | (0, 1, 2, 3, 5, 6, 7, 9, 11, 12, 13, 14, 15, 1... | [0.8307545777156063, 0.8248524349521302, 0.837... | 0.830818 | (total_pymnt_round_segm, verification_status_s... |
| 18 | (0, 1, 2, 3, 5, 6, 7, 9, 11, 12, 13, 14, 15, 1... | [0.8308250578291482, 0.824376907014837, 0.8370... | 0.830737 | (total_pymnt_round_segm, verification_status_s... |
| 19 | (0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15... | [0.8297277454922989, 0.825593534881497, 0.8364... | 0.83092 | (total_pymnt_round_segm, verification_status_s... |
| 20 | (0, 1, 2, 3, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15... | [0.8301681208562044, 0.824090720580686, 0.8356... | 0.830367 | (total_pymnt_round_segm, verification_status_s... |
| 21 | (0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... | [0.8303448583360461, 0.8251569453347942, 0.836... | 0.830744 | (total_pymnt_round_segm, verification_status_s... |
| 22 | (0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... | [0.83066303951936, 0.8255756225060463, 0.83691... | 0.830938 | (total_pymnt_round_segm, verification_status_s... |
| 23 | (0, 1, 2, 3, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14... | [0.8299909715261057, 0.8250851526771432, 0.835... | 0.830453 | (total_pymnt_round_segm, verification_status_s... |
| 24 | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | [0.83006687731912, 0.8249857846243176, 0.83611... | 0.83042 | (total_pymnt_round_segm, verification_status_s... |
| 25 | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | [0.8316857353335647, 0.8256496340793376, 0.835... | 0.830279 | (total_pymnt_round_segm, verification_status_s... |
| 26 | (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... | [0.8308314862744635, 0.8267472815420289, 0.834... | 0.830266 | (total_pymnt_round_segm, verification_status_s... |
# Again, stop where the cross-validated performance stops improving:
# inspect the 15-feature subset chosen by the forward selection
sfs_GB_pdf.loc[15, 'feature_idx']
(0, 1, 2, 3, 5, 6, 7, 9, 11, 12, 13, 14, 15, 23, 24)
# Keep only the columns picked at step 15 of the forward selection,
# applied identically to both samples
gb_feature_idx = [0, 1, 2, 3, 5, 6, 7, 9, 11, 12, 13, 14, 15, 23, 24]
X_train_GB = X_train_segmented_final.iloc[:, gb_feature_idx]
X_test_GB = X_test_segmented_final.iloc[:, gb_feature_idx]
X_train_GB
| total_pymnt_round_segm | verification_status_segm | mths_since_recent_inq_segm | inq_last_6mths_segm | fico_range_low_segm | purpose_segm | instlmnt_round_segm | term_segm | open_acc_6m_segm | mort_acc_segm | open_rv_12m_segm | revol_bal_segm | emp_length_segm | application_type_segm | home_ownership_segm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 1 | 4 | 1 | 2 | 1 | 2 | 1 | 1 | 1 | 1 | 2 | 2 | 2 |
| 1 | 2 | 2 | 2 | 4 | 2 | 2 | 2 | 2 | 2 | 4 | 2 | 3 | 2 | 2 | 1 |
| 2 | 1 | 3 | 1 | 2 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | -1 | 2 |
| 3 | 2 | 2 | 1 | 3 | 1 | 2 | 2 | 1 | 1 | 3 | 1 | 1 | 1 | 2 | 2 |
| 4 | 1 | 2 | 2 | 4 | 4 | 1 | 1 | 2 | 1 | 4 | 1 | 1 | 1 | 2 | 2 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 107859 | 3 | 3 | -1 | 4 | 1 | 2 | 1 | 1 | 1 | 2 | 1 | 2 | -1 | 2 | 2 |
| 107860 | 1 | 3 | 3 | 4 | 2 | 2 | 1 | 2 | 1 | 4 | 1 | 3 | 1 | 2 | 2 |
| 107861 | 1 | 2 | 2 | 4 | 2 | 2 | 1 | 2 | 1 | 3 | 1 | 1 | 1 | 2 | 2 |
| 107862 | 2 | 1 | 3 | 4 | 2 | 2 | 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 2 |
| 107863 | 3 | 2 | 1 | 3 | 1 | 1 | 1 | 2 | 1 | 1 | 1 | 1 | 2 | 2 | 1 |
107864 rows × 15 columns
from skopt import BayesSearchCV
from skopt.space import Real, Integer
# Define the parameter space for Bayesian optimization
# (explicit Integer/Real dimensions this time, including the learning rate
# and the subsample fraction used for stochastic gradient boosting)
hyperparameter_space = {
    'n_estimators': Integer(50, 200),
    'learning_rate': Real(0.001, 0.2, prior='uniform'),
    'min_impurity_decrease': Real(0.001, 0.2, prior='uniform'),
    'max_depth': Integer(6, 9),
    'max_features': Integer(6, 9),
    'min_samples_leaf': Integer(70, 150),
    'min_samples_split': Integer(70, 150),
    'subsample': Real(0.5, 0.8, prior='uniform')
}
# Setup BayesSearchCV: 20 optimisation iterations, each candidate
# scored by 5-fold cross-validated ROC-AUC
opt = BayesSearchCV(
    estimator=GradientBoostingClassifier(random_state=100),
    search_spaces=hyperparameter_space,
    scoring='roc_auc',
    n_iter=20,
    cv=5,
    return_train_score=True,
    random_state=100
)
# Fit the BayesSearchCV on the forward-selected GB features
opt.fit(X_train_GB, y_train)
# Best estimator found by BayesSearchCV
best_GBc = opt.best_estimator_
# Use KFold for the cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=100)
# Perform cross-validation using 'roc_auc' metric
# (refits best_GBc on each of the 5 folds)
results = cross_validate(best_GBc, X_train_GB, y_train, cv=kf, scoring='roc_auc')
# Optionally plot the AUC results, one point per fold
plt.errorbar(range(len(results['test_score'])), results['test_score'], fmt='-o')
plt.ylabel('AUC Score')
plt.xlabel('Fold Number')
plt.title('GradientBoosting AUC Performance across Folds')
plt.show()
best_GBc
GradientBoostingClassifier(learning_rate=0.11594796160025214, max_depth=6,
max_features=8,
min_impurity_decrease=0.14285292256728815,
min_samples_leaf=104, min_samples_split=72,
n_estimators=71, random_state=100,
subsample=0.6381717003491763)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GradientBoostingClassifier(learning_rate=0.11594796160025214, max_depth=6,
max_features=8,
min_impurity_decrease=0.14285292256728815,
min_samples_leaf=104, min_samples_split=72,
n_estimators=71, random_state=100,
subsample=0.6381717003491763)# initialize the GradientBoosting
# Initialize the Gradient Boosting classifier with the tuned hyper-parameters.
# NOTE(review): learning_rate, min_impurity_decrease and subsample are rounded
# versions of the BayesSearchCV optimum, so results will differ marginally
# from best_GBc — confirm the rounding is intentional.
GradientBoosting_clf = GradientBoostingClassifier(n_estimators=71, random_state=100,
                                                  max_depth = 6,
                                                  max_features = 8,
                                                  min_impurity_decrease = 0.14285,
                                                  learning_rate = 0.11595,
                                                  min_samples_split = 72,
                                                  min_samples_leaf = 104,
                                                  subsample = 0.638
                                                  )
# Fit on the forward-selected training features
GradientBoosting_clf.fit(X_train_GB, y_train)
GradientBoostingClassifier(learning_rate=0.11595, max_depth=6, max_features=8,
min_impurity_decrease=0.14285, min_samples_leaf=104,
min_samples_split=72, n_estimators=71,
random_state=100, subsample=0.638)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. GradientBoostingClassifier(learning_rate=0.11595, max_depth=6, max_features=8,
min_impurity_decrease=0.14285, min_samples_leaf=104,
min_samples_split=72, n_estimators=71,
random_state=100, subsample=0.638)calculate_DiscriminatoryStats(X_train_GB, y_train, GradientBoosting_clf, 'TRAIN')
Credit Score num_applicants num_goods num_bads total cum_freq_goods \
0 252 1 0.0 1.0 1.0 0.0
1 255 1 0.0 1.0 1.0 0.0
2 256 1 0.0 1.0 1.0 0.0
3 257 1 0.0 1.0 1.0 0.0
4 258 1 0.0 1.0 1.0 0.0
.. ... ... ... ... ... ...
297 565 24 24.0 0.0 24.0 100901.0
298 566 20 20.0 0.0 20.0 100921.0
299 567 19 19.0 0.0 19.0 100940.0
300 568 5 5.0 0.0 5.0 100945.0
301 570 1 1.0 0.0 1.0 100946.0
cum_freq_bads perc_total_goods perc_total_bads cum_perc_goods \
0 1.0 0.00 0.01 0.00
1 2.0 0.00 0.01 0.00
2 3.0 0.00 0.01 0.00
3 4.0 0.00 0.01 0.00
4 5.0 0.00 0.01 0.00
.. ... ... ... ...
297 6918.0 0.02 0.00 99.96
298 6918.0 0.02 0.00 99.98
299 6918.0 0.02 0.00 99.99
300 6918.0 0.00 0.00 100.00
301 6918.0 0.00 0.00 100.00
cum_perc_bads Separation
0 0.01 -0.01
1 0.03 -0.03
2 0.04 -0.04
3 0.06 -0.06
4 0.07 -0.07
.. ... ...
297 100.00 -0.04
298 100.00 -0.02
299 100.00 -0.01
300 100.00 0.00
301 100.00 0.00
[302 rows x 12 columns]
The Kolmogorov-Smirnov statistic on the TRAIN data is: 49.23 AUC metric on the TRAIN data is: 0.84 Gini metric on the TRAIN data is: 0.68
calculate_DiscriminatoryStats(X_test_GB, y_test, GradientBoosting_clf, 'TEST')
Credit Score num_applicants num_goods num_bads total cum_freq_goods \
0 249 1 0.0 1.0 1.0 0.0
1 252 1 0.0 1.0 1.0 0.0
2 255 1 0.0 1.0 1.0 0.0
3 257 2 0.0 2.0 2.0 0.0
4 258 1 0.0 1.0 1.0 0.0
.. ... ... ... ... ... ...
301 566 7 7.0 0.0 7.0 40543.0
302 567 5 5.0 0.0 5.0 40548.0
303 568 1 1.0 0.0 1.0 40549.0
304 569 1 1.0 0.0 1.0 40550.0
305 570 1 1.0 0.0 1.0 40551.0
cum_freq_bads perc_total_goods perc_total_bads cum_perc_goods \
0 1.0 0.00 0.04 0.00
1 2.0 0.00 0.04 0.00
2 3.0 0.00 0.04 0.00
3 5.0 0.00 0.08 0.00
4 6.0 0.00 0.04 0.00
.. ... ... ... ...
301 2377.0 0.02 0.00 99.98
302 2377.0 0.01 0.00 99.99
303 2377.0 0.00 0.00 100.00
304 2377.0 0.00 0.00 100.00
305 2377.0 0.00 0.00 100.00
cum_perc_bads Separation
0 0.04 -0.04
1 0.08 -0.08
2 0.13 -0.13
3 0.21 -0.21
4 0.25 -0.25
.. ... ...
301 100.00 -0.02
302 100.00 -0.01
303 100.00 0.00
304 100.00 0.00
305 100.00 0.00
[306 rows x 12 columns]
The Kolmogorov-Smirnov statistic on the TEST data is: 44.15 AUC metric on the TEST data is: 0.82 Gini metric on the TEST data is: 0.63
calculate_and_plot_psi(X_train_GB, X_test_GB, GradientBoosting_clf, 'TRAINING', 'TEST')
The PSI statistic between TRAINING and TEST sets is: 0.152 Moderate shift in the population (PSI = 0.152)
# Calculate and compare IVs (Information Values) of the GB feature set
# on the train vs test samples
iv_comparison_df = calculate_iv_comparison(X_train_GB, X_test_GB, y_train, y_test)
iv_comparison_df
| Variable | IV_Train | IV_Test | |
|---|---|---|---|
| 0 | total_pymnt_round_segm | 55.61 | 55.74 |
| 2 | mths_since_recent_inq_segm | 7.76 | 10.87 |
| 1 | verification_status_segm | 7.54 | 7.90 |
| 3 | inq_last_6mths_segm | 7.34 | 9.99 |
| 4 | fico_range_low_segm | 6.50 | 6.33 |
| 7 | term_segm | 4.77 | 2.90 |
| 5 | purpose_segm | 3.34 | 3.62 |
| 6 | instlmnt_round_segm | 3.04 | 2.70 |
| 9 | mort_acc_segm | 2.97 | 3.75 |
| 12 | emp_length_segm | 2.74 | 2.83 |
| 8 | open_acc_6m_segm | 2.63 | 3.67 |
| 10 | open_rv_12m_segm | 2.04 | 3.30 |
| 11 | revol_bal_segm | 1.64 | 2.66 |
| 13 | application_type_segm | 0.37 | 0.49 |
| 14 | home_ownership_segm | 0.06 | 0.13 |
import tensorflow as tf
# for the analysis of the optimization procedure
from skopt.space import Real, Integer, Categorical
##################### Tensorflow - Keras #########################
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau
from skopt import gp_minimize
from skopt.plots import plot_convergence, plot_objective, plot_evaluations
from skopt.utils import use_named_args
The validation set will be a small part of the training set.
We will use 20% of the training set as the validation set.
# Names of the segmented predictors (stored as category/object dtypes);
# these need numeric casting before being fed to the neural network
segm_dtypes = ['category', 'object']
categorical_columns = X_train_segmented_final.select_dtypes(include=segm_dtypes).columns
categorical_columns
Index(['total_pymnt_round_segm', 'verification_status_segm',
'mths_since_recent_inq_segm', 'inq_last_6mths_segm',
'inq_last_12m_segm', 'fico_range_low_segm', 'purpose_segm',
'instlmnt_round_segm', 'acc_open_past_24mths_segm', 'term_segm',
'open_rv_24m_segm', 'open_acc_6m_segm', 'mort_acc_segm',
'open_rv_12m_segm', 'revol_bal_segm', 'emp_length_segm',
'open_il_12m_segm', 'mths_since_last_major_derog_segm',
'Annual_Inc_round_segm', 'dti_rounded_segm',
'mths_since_last_record_segm', 'mths_since_last_delinq_segm',
'delinq_2yrs_segm', 'application_type_segm', 'home_ownership_segm',
'years_with_Credit_line_segm'],
dtype='object')
# Build integer copies of both samples: cast every segmented
# (category/object) predictor to int in a single astype call,
# leaving any other columns untouched
int_cast_map = {col: int for col in categorical_columns}
X_train_segmented_final_int = X_train_segmented_final.astype(int_cast_map)
X_test_segmented_final_int = X_test_segmented_final.astype(int_cast_map)
# Targets as integers as well
y_train_int = y_train.astype(int)
y_train_int.shape
(107864,)
# Carve a stratified 20% validation split out of the training data
# (same class proportions in both parts, fixed seed for reproducibility)
X_train_nn, X_val, y_train_nn, y_val = train_test_split(
    X_train_segmented_final_int,
    y_train_int,
    test_size=0.2,
    random_state=100,
    stratify=y_train_int)
X_train_nn.shape, X_val.shape
# If the targets arrive one-hot encoded with two columns, collapse them
# back to single integer labels
if y_train_nn.ndim > 1 and y_train_nn.shape[1] == 2:
    y_train_nn = y_train_nn.argmax(axis=1)
    y_val = y_val.argmax(axis=1)
X_train_nn.shape
(86291, 26)
X_val.shape
(21573, 26)
# Convert y_train into categorical cross entropy format
# num_classes = len(np.unique(y_train))
# y_train_nn = keras.utils.to_categorical(y_train_nn, num_classes)
# y_test_nn = keras.utils.to_categorical(y_test, num_classes)
# y_val_nn = keras.utils.to_categorical(y_val, num_classes)
# y_train_nn
Input Layer: The input shape is set to (26,) to match the 26 predictors.
Hidden Layers: Two dense layers with the specified number of nodes and activation functions.
Output Layer: A single node with a sigmoid activation function -for binary classification.
Optimizer: Adam optimizer with the specified learning rate.
Loss Function: binary_crossentropy -for binary classification problems.
Metrics: AUC is monitored during training and evaluation (instead of plain accuracy), matching the ROC-AUC criterion used for the other models.
# Fix the TF RNG so weight initialisation / dropout are reproducible
tf.random.set_seed(100)
# Creating the model
model_opt = tf.keras.Sequential([
    tf.keras.layers.Dense(16, activation=tf.keras.activations.relu, input_shape=(26,)),  # first dense layer (26 input predictors), relu activation
    tf.keras.layers.Dropout(0.2),  # Dropout layer with a 20% dropout rate
    tf.keras.layers.Dense(32, activation=tf.keras.activations.relu),  # hidden layer 1, relu activation
    tf.keras.layers.Dropout(0.2),  # Dropout layer with a 20% dropout rate
    tf.keras.layers.Dense(64, activation=tf.keras.activations.relu),  # hidden layer 2, relu activation
    tf.keras.layers.Dropout(0.2),  # Dropout layer with a 20% dropout rate
    tf.keras.layers.Dense(1, activation=tf.keras.activations.sigmoid)  # output layer, sigmoid activation (binary classification)
])
# Compile the model
model_opt.compile(loss=tf.keras.losses.BinaryCrossentropy(),  # for binary problems --> BinaryCrossentropy()
                  optimizer=tf.keras.optimizers.Adam(),  # Adam with default settings
                  metrics=['AUC'])  # track ROC-AUC during training/validation
# Fit the model & obtain the history curves
history = model_opt.fit(X_train_nn, y_train_nn,
                        validation_data=(X_val, y_val),
                        epochs=50,
                        batch_size=16,
                        verbose=1)
Epoch 1/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 9s 1ms/step - AUC: 0.6185 - loss: 0.2454 - val_AUC: 0.7979 - val_loss: 0.1990 Epoch 2/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.7768 - loss: 0.2017 - val_AUC: 0.8146 - val_loss: 0.1886 Epoch 3/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.7950 - loss: 0.1878 - val_AUC: 0.8140 - val_loss: 0.1836 Epoch 4/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8041 - loss: 0.1822 - val_AUC: 0.8144 - val_loss: 0.1805 Epoch 5/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8034 - loss: 0.1812 - val_AUC: 0.8134 - val_loss: 0.1844 Epoch 6/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8033 - loss: 0.1805 - val_AUC: 0.8168 - val_loss: 0.1824 Epoch 7/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8052 - loss: 0.1795 - val_AUC: 0.8144 - val_loss: 0.1845 Epoch 8/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8075 - loss: 0.1787 - val_AUC: 0.8150 - val_loss: 0.1835 Epoch 9/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8099 - loss: 0.1785 - val_AUC: 0.8164 - val_loss: 0.1829 Epoch 10/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8122 - loss: 0.1783 - val_AUC: 0.8199 - val_loss: 0.1827 Epoch 11/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8099 - loss: 0.1790 - val_AUC: 0.8142 - val_loss: 0.1902 Epoch 12/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8119 - loss: 0.1773 - val_AUC: 0.8132 - val_loss: 0.1834 Epoch 13/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8094 - loss: 0.1783 - val_AUC: 0.8154 - val_loss: 0.1826 Epoch 14/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8105 - loss: 0.1776 - val_AUC: 0.8172 - val_loss: 0.1825 Epoch 15/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8162 - loss: 0.1765 - val_AUC: 0.8165 - val_loss: 0.1832 Epoch 16/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8134 - loss: 0.1776 - val_AUC: 0.8127 - val_loss: 0.1820 Epoch 17/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8103 - 
loss: 0.1774 - val_AUC: 0.8154 - val_loss: 0.1827 Epoch 18/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8133 - loss: 0.1775 - val_AUC: 0.8181 - val_loss: 0.1847 Epoch 19/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8123 - loss: 0.1775 - val_AUC: 0.8161 - val_loss: 0.1820 Epoch 20/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8141 - loss: 0.1772 - val_AUC: 0.8162 - val_loss: 0.1838 Epoch 21/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8163 - loss: 0.1764 - val_AUC: 0.8194 - val_loss: 0.1842 Epoch 22/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8149 - loss: 0.1765 - val_AUC: 0.8154 - val_loss: 0.1869 Epoch 23/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8145 - loss: 0.1774 - val_AUC: 0.8167 - val_loss: 0.1843 Epoch 24/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8154 - loss: 0.1759 - val_AUC: 0.8149 - val_loss: 0.1888 Epoch 25/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8164 - loss: 0.1766 - val_AUC: 0.8157 - val_loss: 0.1857 Epoch 26/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8151 - loss: 0.1768 - val_AUC: 0.8186 - val_loss: 0.1833 Epoch 27/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8164 - loss: 0.1759 - val_AUC: 0.8150 - val_loss: 0.1820 Epoch 28/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8165 - loss: 0.1762 - val_AUC: 0.8169 - val_loss: 0.1848 Epoch 29/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8180 - loss: 0.1752 - val_AUC: 0.8168 - val_loss: 0.1811 Epoch 30/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8166 - loss: 0.1771 - val_AUC: 0.8166 - val_loss: 0.1832 Epoch 31/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8155 - loss: 0.1761 - val_AUC: 0.8130 - val_loss: 0.1819 Epoch 32/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8163 - loss: 0.1761 - val_AUC: 0.8148 - val_loss: 0.1839 Epoch 33/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8181 - loss: 0.1760 - val_AUC: 0.8171 - val_loss: 0.1835 Epoch 34/50 
5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8193 - loss: 0.1752 - val_AUC: 0.8159 - val_loss: 0.1822 Epoch 35/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8188 - loss: 0.1755 - val_AUC: 0.8168 - val_loss: 0.1846 Epoch 36/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8176 - loss: 0.1764 - val_AUC: 0.8164 - val_loss: 0.1865 Epoch 37/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8159 - loss: 0.1761 - val_AUC: 0.8149 - val_loss: 0.1850 Epoch 38/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8213 - loss: 0.1758 - val_AUC: 0.8155 - val_loss: 0.1866 Epoch 39/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8181 - loss: 0.1757 - val_AUC: 0.8158 - val_loss: 0.1856 Epoch 40/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 6s 1ms/step - AUC: 0.8181 - loss: 0.1770 - val_AUC: 0.8162 - val_loss: 0.1813 Epoch 41/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8181 - loss: 0.1761 - val_AUC: 0.8154 - val_loss: 0.1811 Epoch 42/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8175 - loss: 0.1755 - val_AUC: 0.8164 - val_loss: 0.1850 Epoch 43/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8192 - loss: 0.1756 - val_AUC: 0.8168 - val_loss: 0.1808 Epoch 44/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8167 - loss: 0.1763 - val_AUC: 0.8189 - val_loss: 0.1812 Epoch 45/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8182 - loss: 0.1759 - val_AUC: 0.8156 - val_loss: 0.1846 Epoch 46/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8165 - loss: 0.1757 - val_AUC: 0.8161 - val_loss: 0.1825 Epoch 47/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8165 - loss: 0.1764 - val_AUC: 0.8173 - val_loss: 0.1813 Epoch 48/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8175 - loss: 0.1762 - val_AUC: 0.8144 - val_loss: 0.1835 Epoch 49/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8201 - loss: 0.1753 - val_AUC: 0.8159 - val_loss: 0.1835 Epoch 50/50 5394/5394 ━━━━━━━━━━━━━━━━━━━━ 7s 1ms/step - AUC: 0.8185 - 
loss: 0.1751 - val_AUC: 0.8160 - val_loss: 0.1807
# Diagnostic learning curves: loss and AUC for train/validation per epoch
history_pdf = pd.DataFrame(history.history)
history_pdf.plot(title="Diagnostic plot - Optimized model")
<Axes: title={'center': 'Diagnostic plot - Optimized model'}>
calculate_DiscriminatoryStats_nn(X_train_segmented_final_int, y_train, model_opt, 'TRAINING')
3371/3371 ━━━━━━━━━━━━━━━━━━━━ 2s 667us/step Credit Score num_applicants num_goods num_bads total cum_freq_goods \ 0 3 248 7.0 241.0 248.0 7.0 1 5 2 0.0 2.0 2.0 7.0 2 6 2 0.0 2.0 2.0 7.0 3 8 1 0.0 1.0 1.0 7.0 4 9 3 0.0 3.0 3.0 7.0 .. ... ... ... ... ... ... 623 834 1 1.0 0.0 1.0 100942.0 624 840 1 1.0 0.0 1.0 100943.0 625 848 1 1.0 0.0 1.0 100944.0 626 860 1 1.0 0.0 1.0 100945.0 627 880 1 1.0 0.0 1.0 100946.0 cum_freq_bads perc_total_goods perc_total_bads cum_perc_goods \ 0 241.0 0.01 3.48 0.01 1 243.0 0.00 0.03 0.01 2 245.0 0.00 0.03 0.01 3 246.0 0.00 0.01 0.01 4 249.0 0.00 0.04 0.01 .. ... ... ... ... 623 6918.0 0.00 0.00 100.00 624 6918.0 0.00 0.00 100.00 625 6918.0 0.00 0.00 100.00 626 6918.0 0.00 0.00 100.00 627 6918.0 0.00 0.00 100.00 cum_perc_bads Separation 0 3.48 -3.47 1 3.51 -3.50 2 3.54 -3.53 3 3.56 -3.55 4 3.60 -3.59 .. ... ... 623 100.00 0.00 624 100.00 0.00 625 100.00 0.00 626 100.00 0.00 627 100.00 0.00 [628 rows x 12 columns]
The Kolmogorov-Smirnov statistic on the TRAINING data is: 46.46 AUC metric on the TRAINING data is: 0.83 Gini metric on the TRAINING data is: 0.65
calculate_DiscriminatoryStats_nn(X_test_segmented_final_int, y_test, model_opt, 'TEST')
1342/1342 ━━━━━━━━━━━━━━━━━━━━ 1s 568us/step Credit Score num_applicants num_goods num_bads total cum_freq_goods \ 0 2 137 1.0 136.0 137.0 1.0 1 3 3 0.0 3.0 3.0 1.0 2 5 1 0.0 1.0 1.0 1.0 3 6 3 0.0 3.0 3.0 1.0 4 12 1 0.0 1.0 1.0 1.0 .. ... ... ... ... ... ... 554 797 1 1.0 0.0 1.0 40546.0 555 814 1 1.0 0.0 1.0 40547.0 556 834 1 1.0 0.0 1.0 40548.0 557 848 1 1.0 0.0 1.0 40549.0 558 880 2 2.0 0.0 2.0 40551.0 cum_freq_bads perc_total_goods perc_total_bads cum_perc_goods \ 0 136.0 0.0 5.72 0.00 1 139.0 0.0 0.13 0.00 2 140.0 0.0 0.04 0.00 3 143.0 0.0 0.13 0.00 4 144.0 0.0 0.04 0.00 .. ... ... ... ... 554 2377.0 0.0 0.00 99.99 555 2377.0 0.0 0.00 99.99 556 2377.0 0.0 0.00 99.99 557 2377.0 0.0 0.00 100.00 558 2377.0 0.0 0.00 100.00 cum_perc_bads Separation 0 5.72 -5.72 1 5.85 -5.85 2 5.89 -5.89 3 6.02 -6.02 4 6.06 -6.06 .. ... ... 554 100.00 -0.01 555 100.00 -0.01 556 100.00 -0.01 557 100.00 0.00 558 100.00 0.00 [559 rows x 12 columns]
The Kolmogorov-Smirnov statistic on the TEST data is: 44.02 AUC metric on the TEST data is: 0.81 Gini metric on the TEST data is: 0.63
calculate_and_plot_psi_nn(X_train_nn, X_test_segmented_final_int, model_opt, 'Training', 'Test')
2697/2697 ━━━━━━━━━━━━━━━━━━━━ 2s 577us/step 1342/1342 ━━━━━━━━━━━━━━━━━━━━ 1s 574us/step
The PSI statistic between Training and Test sets is: 0.145 Moderate shift in the population (PSI = 0.145)